archy/tests/lifecycle/bats/backend-survives-archipelago-restart.bats

#!/usr/bin/env bats
# tests/lifecycle/bats/backend-survives-archipelago-restart.bats
#
# Quadlet-everywhere promise (Phase 3 of v1.7.52): backend containers
# (bitcoin-knots / bitcoin-core / lnd / electrumx) are managed by systemd via Quadlet
# units, NOT parented under archipelago.service's cgroup. Restarting the
# archipelago service must NOT take them down.
#
# This is the regression gate for FM3 (cgroup cascade SIGKILL — observed
# live on .198 on 2026-05-01: stopping archipelago.service killed every
# container in its cgroup, leaving the box in a multi-hour recovery
# loop). Until v1.7.52 Phase 3 ships, this suite is EXPECTED TO FAIL on
# fleet boxes — it serves as the executable definition of "Phase 3
# complete". Do not gate the release on it passing pre-Phase-3.
#
# Sister to companion-survives-archipelago-restart.bats which tests the
# same property for UI companions (already shipping via Quadlet since
# commit 6e716f68).
#
# Gated by ARCHY_ALLOW_DESTRUCTIVE=1 because it bounces archipelago.

# Container names handed to `podman inspect` by the tests below. Kept as a
# plain assignment (no `declare`) so the array stays visible to every test.
backend_units=("bitcoin-knots" "bitcoin-core" "lnd" "electrumx")

# Succeeds (status 0) only when `podman inspect` prints exactly "true" for
# the container's .State.Running field.
#   $1 - container name
# podman's stderr is discarded and its exit status is ignored, so a missing
# container simply yields a non-zero return.
container_running() {
  local ctr="$1"
  local running_flag

  running_flag="$(podman inspect --format '{{.State.Running}}' "$ctr" 2>/dev/null)" || true

  if [[ "$running_flag" == "true" ]]; then
    return 0
  fi
  return 1
}

# Poll archipelago's HTTP health endpoint until one probe succeeds.
#   $1 - overall timeout in seconds (default: 60)
#   $2 - URL to probe (default: http://127.0.0.1:5678/health)
# Returns 0 as soon as curl reports success, 1 once the deadline has passed.
wait_archipelago_back() {
  local timeout="${1:-60}"
  local url="${2:-http://127.0.0.1:5678/health}"
  # SECONDS is bash's built-in elapsed-time counter; using it avoids forking
  # `date` on every iteration.
  local deadline=$(( SECONDS + timeout ))
  local remaining probe_cap

  while (( SECONDS < deadline )); do
    # Bound each probe so a socket that accepts but never answers cannot
    # hold the loop past the caller's deadline.
    remaining=$(( deadline - SECONDS ))
    probe_cap=$(( remaining < 5 ? remaining : 5 ))
    if curl -fsS -o /dev/null --max-time "$probe_cap" "$url" 2>/dev/null; then
      return 0
    fi
    sleep 2
  done
  return 1
}

# Reports as skipped unless the operator opted in to destructive tests by
# exporting ARCHY_ALLOW_DESTRUCTIVE=1.
@test "destructive gate enabled" {
  if [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" != "1" ]]; then
    skip "ARCHY_ALLOW_DESTRUCTIVE not set"
  fi
}

# Precondition probe: skipped when the destructive gate is off or when none
# of the names in backend_units is reported as running.
@test "at least one backend container is running before restart" {
  if [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" != "1" ]]; then
    skip "ARCHY_ALLOW_DESTRUCTIVE not set"
  fi

  # Tally every known backend that podman currently reports as running.
  local running=0 unit
  for unit in "${backend_units[@]}"; do
    container_running "$unit" || continue
    (( running += 1 ))
  done

  if (( running == 0 )); then
    skip "No backends installed on this node"
  fi
}

# Print a failure reason on stderr and return non-zero so the calling test
# fails with a readable message. Self-contained on purpose: this file does
# not load bats-support, so its `fail` helper cannot be assumed to exist.
archy_fail() {
  printf '%s\n' "$*" >&2
  return 1
}

# Core FM3 regression check: every backend container that is running before
# archipelago.service is restarted must still be running afterwards, as the
# same container (.Id) and the same uninterrupted run (.State.StartedAt).
@test "backends survive archipelago restart" {
  [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set"

  # Snapshot: which backends were up before we touched anything.
  local c
  local before=()
  for c in "${backend_units[@]}"; do
    if container_running "$c"; then
      before+=("$c")
    fi
  done
  (( ${#before[@]} > 0 )) || skip "No backends installed on this node"

  # Capture a pre-restart fingerprint ("<Id>|<StartedAt>") per container so
  # we can verify the SAME running instance survives. The Id alone is not
  # enough: a container that was SIGKILL'd by the cascade and then started
  # again keeps its Id, but its StartedAt timestamp changes.
  local -A pre_fp=()
  for c in "${before[@]}"; do
    pre_fp["$c"]=$(podman inspect --format '{{.Id}}|{{.State.StartedAt}}' "$c" 2>/dev/null || echo "")
  done

  # Bounce archipelago. Same approach as companion-survives-* for parity.
  if systemctl --user list-units --no-legend archipelago.service | grep -q archipelago; then
    systemctl --user restart archipelago.service
  else
    sudo systemctl restart archipelago.service
  fi

  run wait_archipelago_back 60
  [ "$status" -eq 0 ]

  # Every backend that was up before must still be up after, AND it must
  # be the SAME instance: identical .Id (not recreated) and identical
  # .State.StartedAt (not killed and restarted in place). Either mismatch
  # is the FM3 failure we're catching.
  local post_fp pre_id post_id pre_started post_started
  for c in "${before[@]}"; do
    run container_running "$c"
    [ "$status" -eq 0 ] || archy_fail "backend $c died across archipelago restart (FM3 cgroup cascade)"

    post_fp=$(podman inspect --format '{{.Id}}|{{.State.StartedAt}}' "$c" 2>/dev/null || echo "")
    pre_id="${pre_fp[$c]%%|*}"
    post_id="${post_fp%%|*}"
    pre_started="${pre_fp[$c]#*|}"
    post_started="${post_fp#*|}"

    [[ -n "$post_id" ]] || archy_fail "backend $c has no container id after restart"
    [[ "$post_id" == "$pre_id" ]] \
      || archy_fail "backend $c was recreated across archipelago restart (FM3): pre=${pre_id:0:12} post=${post_id:0:12}"
    [[ "$post_started" == "$pre_started" ]] \
      || archy_fail "backend $c was restarted across archipelago restart (FM3): started pre=${pre_started} post=${post_started}"
  done
}