archy/tests/lifecycle/bats/use-quadlet-backends-install.bats
archipelago 3cea7dd6c5 test(phase3): fix Phase-3 quadlet gates — define fail(), drop stale Notify=healthy assert
Two Phase-3 bats suites used `fail` (a bats-assert helper) but bats-assert
isn't installed on the alpha fleet (only bats-core), so every tripped
assertion crashed with `fail: command not found` (status 127) instead of
reporting a real pass/fail. Define the same minimal `fail() { echo ...;
return 1; }` the other suites already use (see mempool.bats). Without this
the gates were silently non-functional.

Also rewrite the obsolete "HealthCmd= implies Notify=healthy" assertion in
use-quadlet-backends-install.bats. Phase 3.4's Notify=healthy was
deliberately reverted: gating `systemctl start` on health hung boot
reconciliation for dependency-waiting apps (fedimint idles until Bitcoin
IBD; lnd until macaroon unlock), leaving units stuck "deactivating". The
renderer now emits HealthCmd= for Podman's health state but TimeoutStartSec=0
and NO Notify=healthy (quadlet.rs render() + contains_stale_health_gate()).
The test now asserts the current invariant: no backend unit gates start on
health.

Verified on the .228 canary node (ARCHIPELAGO_USE_QUADLET_BACKENDS=1):
use-quadlet-backends-install 6/6, backend-survives-archipelago-restart 3/3.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-28 16:09:05 -04:00

161 lines
7.1 KiB
Bash

#!/usr/bin/env bats
# tests/lifecycle/bats/use-quadlet-backends-install.bats
#
# Validates the post-condition of Phase 3.2's `use_quadlet_backends`
# install path. When the orchestrator routed at least one backend
# install through `install_via_quadlet`, this suite asserts that the
# resulting state has the four properties the Phase 3 design promises:
#
# 1. A `.container` unit file exists in ~/.config/containers/systemd/
# and is well-formed (required sections + directives).
# 2. The corresponding `.service` is active under `systemctl --user`.
# 3. The container is in `podman ps` (running).
# 4. The container's cgroup is under `user.slice/...`, NOT under
# `archipelago.service` — proving FM3 (cgroup cascade SIGKILL on
# archipelago restart) is structurally fixed for that container.
#
# Auto-skips if no Quadlet-managed backend exists yet — so it runs as a
# no-op on nodes where `use_quadlet_backends` is still false (today's
# default), and turns into a hard regression gate as soon as anyone
# flips the flag and reinstalls.
#
# Run on a node with rootless podman + systemd-user (every alpha-fleet
# box). No env vars are required: all checks in this file are read-only.
# NOTE: no destructive cleanup section exists in this file yet; if one is
# added, it must be gated by ARCHY_ALLOW_DESTRUCTIVE=1.
# bats-core ships no `fail`, and bats-assert isn't installed on the alpha
# fleet, so define the same minimal helper the other suites use (see
# mempool.bats). Without it a tripped assertion dies with "command not found"
# (status 127) instead of being reported as a real test failure.
#
# Arguments: $@ - message words; joined with single spaces
# Outputs:   the message followed by a newline, on stderr
# Returns:   1, always, so `cond || fail "msg"` fails the enclosing test
fail() {
  # Pin the join character so "$*" does not depend on the caller's IFS.
  local IFS=' '
  # printf rather than echo: echo would silently swallow a message whose
  # first word looks like one of its options (-n, -e, -E).
  printf '%s\n' "$*" >&2
  return 1
}
# Print the directory that holds the user-scope Quadlet unit files:
# <config root>/containers/systemd, where the config root is
# $XDG_CONFIG_HOME, or $HOME/.config when that is unset or empty.
quadlet_dir() {
  local config_root="${XDG_CONFIG_HOME:-$HOME/.config}"
  printf '%s\n' "${config_root}/containers/systemd"
}
# List Quadlet `.container` units that correspond to backend containers
# (i.e., NOT companions like archy-*-ui, which already shipped via Quadlet
# in v1.7.41 and have their own coverage in companion-survives-archipelago-
# restart.bats).
#
# Outputs: one unit name per line (file name minus `.container`); nothing
#          when the Quadlet directory is missing or holds no backend units.
# Returns: 0 when the directory does not exist.
backend_quadlet_units() {
  # Everything is local, including the loop variable, so calling this from
  # a test never clobbers a same-named variable in the caller.
  local dir unit_file name
  dir="$(quadlet_dir)"
  [[ -d "$dir" ]] || return 0
  for unit_file in "$dir"/*.container; do
    # An unmatched glob stays as the literal pattern; skip that non-file.
    [[ -e "$unit_file" ]] || continue
    # Strip the directory and the .container extension without forking.
    name="${unit_file##*/}"
    name="${name%.container}"
    # Filter out archy-*-ui companions.
    [[ "$name" =~ ^archy-.*-ui$ ]] && continue
    printf '%s\n' "$name"
  done
}
# Print the cgroup path of a running container's main process. Per the
# suite's design notes, rootless podman places the container's pid1 in the
# cgroup that owns its supervising .service.
#
# Arguments: $1 - container name, as understood by `podman inspect`
# Outputs:   the third colon-separated field of the hierarchy-0 line in
#            /proc/<pid>/cgroup (cgroup v2 format: "0::/path/to/cgroup")
# Returns:   1 when podman reports no PID (empty or 0); otherwise awk's status
container_cgroup_path() {
  local container="$1" main_pid
  main_pid="$(podman inspect --format '{{.State.Pid}}' "$container" 2>/dev/null)"
  # An empty answer (inspect failed) or PID 0 both mean "no live process".
  if [[ -z "$main_pid" || "$main_pid" == "0" ]]; then
    return 1
  fi
  awk -F: '$1=="0"{print $3}' "/proc/${main_pid}/cgroup" 2>/dev/null
}
# Per-test gate: skip the calling test unless at least one backend
# `.container` unit is present. Every @test invokes this explicitly, which
# keeps the suite a clean no-op on nodes where use_quadlet_backends is
# still false (today's default).
require_quadlet_backends() {
  local total=0 _line
  # Count the lines backend_quadlet_units prints (one per backend unit).
  while IFS= read -r _line; do
    total=$(( total + 1 ))
  done < <(backend_quadlet_units)
  if (( total <= 0 )); then
    skip "no backend .container units in $(quadlet_dir) — use_quadlet_backends not enabled or no backends installed"
  fi
}
@test "Quadlet unit dir exists or is plausibly creatable" {
  local unit_dir
  unit_dir="$(quadlet_dir)"
  # Pass when the unit directory is already there; otherwise accept an
  # existing immediate parent (so quadlet can mkdir the last component).
  # NOTE(review): the immediate parent is ".../containers", while the skip
  # message talks about the config root — confirm which one is intended.
  if [[ ! -d "$unit_dir" ]]; then
    if [[ ! -d "$(dirname "$unit_dir")" ]]; then
      skip "no XDG_CONFIG_HOME and no \$HOME/.config — not a desktop-style host"
    fi
  fi
}
@test "each backend Quadlet unit has the required sections + directives" {
  require_quadlet_backends
  local unit_dir unit unit_text
  unit_dir="$(quadlet_dir)"
  # Each check is a plain substring match against the whole unit file.
  while read -r unit; do
    if [[ -z "$unit" ]]; then
      continue
    fi
    unit_text="$(<"$unit_dir/$unit.container")"

    # The container definition itself: section header plus an image.
    if [[ "$unit_text" != *"[Container]"* ]]; then
      fail "$unit: missing [Container] section"
    fi
    if [[ "$unit_text" != *"Image="* ]]; then
      fail "$unit: missing Image= directive"
    fi

    # Phase 3.2 backend invariant: Restart=on-failure. Companions restart
    # always; a backend must stay down after an operator-issued
    # `systemctl stop`.
    if [[ "$unit_text" != *"[Service]"* ]]; then
      fail "$unit: missing [Service] section"
    fi
    if [[ "$unit_text" != *"Restart=on-failure"* ]]; then
      fail "$unit: backend unit must use Restart=on-failure (got companion-style Restart=always)"
    fi

    # Needed for `systemctl --user enable` to be well-defined.
    if [[ "$unit_text" != *"[Install]"* ]]; then
      fail "$unit: missing [Install] section"
    fi
    if [[ "$unit_text" != *"WantedBy="* ]]; then
      fail "$unit: missing WantedBy= in [Install]"
    fi
  done < <(backend_quadlet_units)
}
@test "health is app-level state, NOT a systemd start gate (no Notify=healthy)" {
  require_quadlet_backends
  # Background: Phase 3.4 first shipped Notify=healthy, which made
  # `systemctl start` wait for a passing healthcheck. It was reverted on
  # purpose — dependency-waiting apps (fedimint idling until Bitcoin IBD
  # completes, lnd waiting for its macaroon unlock) hung boot reconciliation
  # and left units stuck "deactivating". Today the renderer emits HealthCmd=
  # for Podman's health state together with TimeoutStartSec=0 and never
  # Notify=healthy (see quadlet.rs render() + contains_stale_health_gate()).
  # Invariant checked here: no backend unit gates its start on health.
  local unit_dir unit unit_text
  unit_dir="$(quadlet_dir)"
  while read -r unit; do
    [[ -n "$unit" ]] || continue
    unit_text="$(<"$unit_dir/$unit.container")"
    case "$unit_text" in
      *"Notify=healthy"*)
        fail "$unit: emits Notify=healthy — stale health gate; start would block on health and can hang boot reconcile"
        ;;
    esac
  done < <(backend_quadlet_units)
}
@test "every backend Quadlet unit's .service is active in systemctl --user" {
  require_quadlet_backends
  local unit
  while read -r unit; do
    [[ -n "$unit" ]] || continue
    # `is-active` exits 0 only for an active unit; its printed state is
    # echoed back in the failure message.
    run systemctl --user is-active "${unit}.service"
    if (( status != 0 )); then
      fail "${unit}.service is '$output' — expected 'active'"
    fi
  done < <(backend_quadlet_units)
}
@test "every backend Quadlet unit has a running podman container" {
  require_quadlet_backends
  local name
  while read -r name; do
    [[ -z "$name" ]] && continue
    # Invoke podman directly with the name as its own argv entry. Wrapping
    # the call in `sh -c "... '$name'"` would splice a filename-derived
    # string into shell source, so a quote in the name could break or
    # inject into the command.
    run podman inspect --format '{{.State.Running}}' "$name"
    [[ "$status" -eq 0 ]] || fail "$name not present in podman"
    [[ "$output" == "true" ]] || fail "$name container exists but not running (state=$output)"
  done < <(backend_quadlet_units)
}
@test "FM3 fix: backend cgroup is under user.slice, not archipelago.service" {
  require_quadlet_backends
  # This is the kernel-level invariant Phase 3 exists to establish.
  local unit cgroup
  while read -r unit; do
    [[ -n "$unit" ]] || continue
    # No readable PID means the container is not running right now; that
    # is treated as a skip rather than a failure of this check.
    if ! cgroup="$(container_cgroup_path "$unit")"; then
      skip "$unit has no readable PID; container may have crashed mid-test"
    fi
    if [[ -z "$cgroup" ]]; then
      fail "$unit: empty cgroup path"
    fi
    # Required: the path mentions user.slice (where rootless podman lands
    # when quadlet-managed).
    if [[ "$cgroup" != *"user.slice"* ]]; then
      fail "$unit: cgroup '$cgroup' is not under user.slice — FM3 cascade still possible"
    fi
    # Forbidden: anything inside archipelago.service's own cgroup tree.
    if [[ "$cgroup" == *"archipelago.service"* ]]; then
      fail "$unit: cgroup '$cgroup' is under archipelago.service — Phase 3 promise broken"
    fi
  done < <(backend_quadlet_units)
}