fix: bulletproof first-boot container creation and install reliability

Remove the Bitcoin RPC 60-second gate that blocked 13+ dependent containers
(mempool, electrumx, btcpay, lnd, fedimint) from being created on first boot.
Containers now always get created and auto-restart via health monitor once
Bitcoin becomes responsive — the designed recovery path.

Additional hardening:
- Validate archy-net creation with retry (silent failure broke DNS)
- Verify critical images are loaded, re-load from tarballs if missing
- Create SearXNG settings.yml before container start (was missing)
- Run reconciler automatically after first-boot failures
- Order first-boot-containers after archipelago-load-images (After= + Wants=) and give first-boot-containers a 900s start timeout
- Propagate config write errors in install.rs (bitcoin.conf, lnd.conf)
- FileBrowser password change: retry loop (6 attempts) + 0o600 perms
- Post-start verification: detect containers that exit immediately
- Add 2s dependency waits between container starts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian 2026-03-31 18:31:00 +01:00
parent a896ecd431
commit 08f7f58a9d
5 changed files with 239 additions and 68 deletions

View File

@ -222,12 +222,12 @@ impl RpcHandler {
// Pre-install: bitcoin.conf with rpcauth
if matches!(package_id, "bitcoin" | "bitcoin-core" | "bitcoin-knots") {
self.write_bitcoin_conf(&rpc_user, &rpc_pass).await;
self.write_bitcoin_conf(&rpc_user, &rpc_pass).await?;
}
// Pre-install: lnd.conf with Bitcoin RPC credentials
if package_id == "lnd" {
self.write_lnd_conf(&rpc_user, &rpc_pass).await;
self.write_lnd_conf(&rpc_user, &rpc_pass).await?;
}
// Pre-install: SearXNG settings.yml (required or container exits immediately)
@ -241,7 +241,12 @@ impl RpcHandler {
"use_default_settings: true\ngeneral:\n instance_name: Archipelago Search\nserver:\n secret_key: \"{}\"\n bind_address: \"0.0.0.0\"\n port: 8080\n limiter: false\nui:\n default_theme: simple\n",
secret_hex
);
let _ = tokio::fs::write(&settings_path, settings).await;
tokio::fs::create_dir_all(searx_dir)
.await
.context("Failed to create SearXNG config directory")?;
tokio::fs::write(&settings_path, settings)
.await
.context("Failed to write SearXNG settings.yml")?;
info!("Created SearXNG settings.yml");
}
}
@ -580,7 +585,7 @@ impl RpcHandler {
}
/// Write bitcoin.conf with rpcauth (salted HMAC hash, no plaintext password).
async fn write_bitcoin_conf(&self, rpc_user: &str, rpc_pass: &str) {
async fn write_bitcoin_conf(&self, rpc_user: &str, rpc_pass: &str) -> Result<()> {
let bitcoin_dir = "/var/lib/archipelago/bitcoin";
let conf_path = format!("{}/bitcoin.conf", bitcoin_dir);
@ -607,20 +612,25 @@ listen=1\n\
printtoconsole=1\n",
rpcauth_line
);
let _ = tokio::fs::create_dir_all(bitcoin_dir).await;
let _ = tokio::fs::write(&conf_path, bitcoin_conf).await;
tokio::fs::create_dir_all(bitcoin_dir)
.await
.context("Failed to create bitcoin data directory")?;
tokio::fs::write(&conf_path, bitcoin_conf)
.await
.context("Failed to write bitcoin.conf")?;
info!("Created bitcoin.conf with rpcauth (no plaintext credentials)");
Ok(())
}
/// Write LND config file with Bitcoin RPC credentials.
async fn write_lnd_conf(&self, rpc_user: &str, rpc_pass: &str) {
async fn write_lnd_conf(&self, rpc_user: &str, rpc_pass: &str) -> Result<()> {
let lnd_dir = "/var/lib/archipelago/lnd";
let conf_path = format!("{}/lnd.conf", lnd_dir);
// Don't overwrite existing config (user may have customized it)
if tokio::fs::try_exists(&conf_path).await.unwrap_or(false) {
info!("lnd.conf already exists, skipping write");
return;
return Ok(());
}
let lnd_conf = format!(
@ -648,24 +658,25 @@ autopilot.active=false\n",
user = rpc_user,
pass = rpc_pass,
);
let _ = tokio::fs::create_dir_all(lnd_dir).await;
let _ = tokio::fs::write(&conf_path, lnd_conf).await;
tokio::fs::create_dir_all(lnd_dir)
.await
.context("Failed to create LND data directory")?;
tokio::fs::write(&conf_path, lnd_conf)
.await
.context("Failed to write lnd.conf")?;
info!("Created lnd.conf with Bitcoin RPC credentials");
Ok(())
}
/// Run post-install hooks (Nextcloud trusted domains, Bitcoin UI container).
/// Critical hooks (credential setup, config) are awaited; UI container builds are background.
async fn run_post_install_hooks(&self, package_id: &str) {
if package_id == "filebrowser" {
// Wait for filebrowser to start and initialize its database
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
// Generate a random password (32 bytes, hex-encoded)
let mut buf = [0u8; 32];
rand::RngCore::fill_bytes(&mut rand::rngs::OsRng, &mut buf);
let password = hex::encode(buf);
// Get a JWT token with default credentials
let client = match reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(10))
.build()
@ -677,53 +688,72 @@ autopilot.active=false\n",
}
};
let login_res = client
.post("http://127.0.0.1:8083/api/login")
.json(&serde_json::json!({"username": "admin", "password": "admin"}))
.send()
.await;
// Retry loop: FileBrowser may take time to initialize its SQLite database
let mut password_changed = false;
for attempt in 0..6u32 {
let delay = if attempt == 0 { 5 } else { 10 };
tokio::time::sleep(std::time::Duration::from_secs(delay)).await;
let token = match login_res {
Ok(resp) if resp.status().is_success() => {
match resp.text().await {
Ok(t) => t.trim_matches('"').to_string(),
Err(e) => {
tracing::warn!("FileBrowser login response parse failed: {}", e);
return;
// Try to log in with default credentials
let login_res = client
.post("http://127.0.0.1:8083/api/login")
.json(&serde_json::json!({"username": "admin", "password": "admin"}))
.send()
.await;
let token = match login_res {
Ok(resp) if resp.status().is_success() => {
match resp.text().await {
Ok(t) => t.trim_matches('"').to_string(),
Err(_) => continue,
}
}
}
_ => {
tracing::warn!("FileBrowser not ready for password change — keeping default");
return;
}
};
_ => {
debug!("FileBrowser not ready (attempt {}/6)", attempt + 1);
continue;
}
};
// Change admin password via filebrowser API
let change_res = client
.put("http://127.0.0.1:8083/api/users/1")
.header("X-Auth", &token)
.json(&serde_json::json!({"password": password}))
.send()
.await;
// Change admin password
let change_res = client
.put("http://127.0.0.1:8083/api/users/1")
.header("X-Auth", &token)
.json(&serde_json::json!({"password": password}))
.send()
.await;
match change_res {
Ok(resp) if resp.status().is_success() => {
let secret_dir = "/var/lib/archipelago/secrets/filebrowser";
let _ = tokio::fs::create_dir_all(secret_dir).await;
let _ = tokio::fs::write(
format!("{}/password", secret_dir),
&password,
).await;
info!("FileBrowser admin password secured (default credentials replaced)");
}
Ok(resp) => {
tracing::warn!("FileBrowser password change failed: {}", resp.status());
}
Err(e) => {
tracing::warn!("FileBrowser password change error: {}", e);
match change_res {
Ok(resp) if resp.status().is_success() => {
let secret_dir = "/var/lib/archipelago/secrets/filebrowser";
if let Err(e) = tokio::fs::create_dir_all(secret_dir).await {
tracing::warn!("Failed to create filebrowser secrets dir: {}", e);
}
let pw_path = format!("{}/password", secret_dir);
if let Err(e) = tokio::fs::write(&pw_path, &password).await {
tracing::warn!("Failed to write filebrowser password: {}", e);
}
// Set restrictive permissions on the password file
#[cfg(unix)]
{
use std::os::unix::fs::PermissionsExt;
let _ = std::fs::set_permissions(
&pw_path,
std::fs::Permissions::from_mode(0o600),
);
}
info!("FileBrowser admin password secured (default credentials replaced)");
password_changed = true;
break;
}
_ => continue,
}
}
if !password_changed {
tracing::warn!(
"FileBrowser password could not be changed after 6 attempts — \
default credentials (admin/admin) remain active"
);
}
}
if package_id == "nextcloud" {

View File

@ -48,7 +48,11 @@ impl RpcHandler {
install_log(&format!("START: {} (containers: {:?})", package_id, to_start)).await;
let mut errors = Vec::new();
for name in &to_start {
for (i, name) in to_start.iter().enumerate() {
// Brief delay between dependent containers to allow initialization
if i > 0 {
tokio::time::sleep(std::time::Duration::from_secs(2)).await;
}
tracing::info!("Starting container: {}", name);
let out = tokio::process::Command::new("podman")
.args(["start", name])
@ -66,6 +70,45 @@ impl RpcHandler {
if !errors.is_empty() {
return Err(anyhow::anyhow!("Start failed: {}", errors.join("; ")));
}
// Verify containers actually reached running state (podman start can
// succeed even if the container exits immediately after)
tokio::time::sleep(std::time::Duration::from_secs(3)).await;
for name in &to_start {
let status = tokio::process::Command::new("podman")
.args(["inspect", name, "--format", "{{.State.Status}}"])
.output()
.await;
if let Ok(o) = status {
let state = String::from_utf8_lossy(&o.stdout).trim().to_string();
if state == "exited" {
let logs = tokio::process::Command::new("podman")
.args(["logs", "--tail", "5", name])
.output()
.await;
let log_text = logs
.map(|o| {
let combined = format!(
"{}{}",
String::from_utf8_lossy(&o.stdout),
String::from_utf8_lossy(&o.stderr)
);
combined.chars().take(200).collect::<String>()
})
.unwrap_or_default();
tracing::error!("Container {} exited after start: {}", name, log_text);
install_log(&format!("START EXITED: {}{}", name, log_text)).await;
errors.push(format!("{}: exited after start", name));
}
}
}
if !errors.is_empty() {
return Err(anyhow::anyhow!(
"Containers exited after start: {}",
errors.join("; ")
));
}
Ok(serde_json::Value::Null)
}

View File

@ -1368,12 +1368,14 @@ FBUNBUNDLED
cat > "$WORK_DIR/archipelago-first-boot-containers.service" <<'FBCSERVICE'
[Unit]
Description=Create core Archipelago containers on first boot
After=archipelago-setup-tor.service network-online.target podman.service
After=archipelago-load-images.service archipelago-setup-tor.service network-online.target podman.service
Wants=archipelago-load-images.service
ConditionPathExists=/opt/archipelago/scripts/first-boot-containers.sh
ConditionPathExists=!/var/lib/archipelago/.first-boot-containers-done
[Service]
Type=oneshot
TimeoutStartSec=900
ExecStart=/opt/archipelago/scripts/first-boot-containers.sh
ExecStartPost=/usr/bin/touch /var/lib/archipelago/.first-boot-containers-done
RemainAfterExit=yes
@ -1395,12 +1397,14 @@ else
cat > "$WORK_DIR/archipelago-first-boot-containers.service" <<'FBCSERVICE'
[Unit]
Description=Create core Archipelago containers on first boot
After=archipelago-setup-tor.service network-online.target podman.service
After=archipelago-load-images.service archipelago-setup-tor.service network-online.target podman.service
Wants=archipelago-load-images.service
ConditionPathExists=/opt/archipelago/scripts/first-boot-containers.sh
ConditionPathExists=!/var/lib/archipelago/.first-boot-containers-done
[Service]
Type=oneshot
TimeoutStartSec=900
ExecStart=/opt/archipelago/scripts/first-boot-containers.sh
ExecStartPost=/usr/bin/touch /var/lib/archipelago/.first-boot-containers-done
RemainAfterExit=yes

View File

@ -412,11 +412,13 @@ load_spec_searxng() {
SPEC_IMAGE="${SEARXNG_IMAGE}"
SPEC_PORTS="8888:8080"
SPEC_MEMORY="$(mem_limit searxng)"
SPEC_VOLUMES="/var/lib/archipelago/searxng:/etc/searxng"
SPEC_HEALTH_CMD="curl -sf http://localhost:8080/ || exit 1"
SPEC_READONLY="true"
SPEC_TMPFS="/tmp:rw,noexec,nosuid,size=256m /run:rw,noexec,nosuid,size=64m /etc/searxng:rw,noexec,nosuid,size=16m"
SPEC_TMPFS="/tmp:rw,noexec,nosuid,size=256m /run:rw,noexec,nosuid,size=64m"
SPEC_TIER="3"
SPEC_CAPS=""
SPEC_DATA_DIR="/var/lib/archipelago/searxng"
}
load_spec_onlyoffice() {

View File

@ -233,8 +233,19 @@ chmod 700 /run/user/1000
runuser -u archipelago -- env XDG_RUNTIME_DIR=/run/user/1000 \
systemctl --user start podman.socket 2>/dev/null || true
# Ensure network exists (matches deploy)
# archy-net carries inter-container DNS (mempool→bitcoin, etc.), so a missing
# network is fatal. Flow: one quiet create attempt, then — only if the network
# still does not exist — a single retry after 5s with stderr captured in $LOG.
$DOCKER network create archy-net 2>/dev/null || true
$DOCKER network exists archy-net 2>/dev/null || {
    log "WARNING: archy-net creation failed, retrying in 5s..."
    sleep 5
    # Second attempt: keep the error output so the failure reason is recorded.
    $DOCKER network create archy-net 2>>"$LOG"
    $DOCKER network exists archy-net 2>/dev/null || {
        log "FATAL: Cannot create archy-net — inter-container DNS will not work."
        log " All containers requiring archy-net will fail. Exiting."
        exit 1
    }
}
log "archy-net network ready"
# Rootless podman UID mapping: fix data dir ownership so container processes
# can write. Rootless podman maps container UIDs via subuid (container UID N
@ -299,6 +310,43 @@ mem_limit() {
esac
}
# ── Verify critical images are loaded ──────────────────────────────────
# archipelago-load-images.service should have loaded these from tarballs.
# If any are missing (corrupt tarball, disk full, etc.), try re-loading from
# /opt/archipelago/container-images/ and then check again, so the log states
# which images (if any) are still absent before container creation begins.

# Names of the variables (expected to come from image-versions.sh) that hold
# the image references considered critical for first boot.
CRITICAL_IMAGE_VARS="BITCOIN_KNOTS_IMAGE MARIADB_IMAGE ELECTRUMX_IMAGE \
MEMPOOL_BACKEND_IMAGE MEMPOOL_WEB_IMAGE BTCPAY_POSTGRES_IMAGE \
NBXPLORER_IMAGE BTCPAY_IMAGE LND_IMAGE FEDIMINT_IMAGE \
FEDIMINT_GATEWAY_IMAGE HOMEASSISTANT_IMAGE GRAFANA_IMAGE \
UPTIME_KUMA_IMAGE JELLYFIN_IMAGE VAULTWARDEN_IMAGE \
NEXTCLOUD_IMAGE SEARXNG_IMAGE FILEBROWSER_IMAGE"

# Print (space-prefixed) the names of the CRITICAL_IMAGE_VARS variables whose
# image is not present in local storage. Prints nothing when all are present.
find_missing_images() {
    local loaded img_var img missing=""
    # List local images once instead of once per variable. A listing failure
    # yields an empty list, i.e. every defined image is reported as missing.
    loaded=$($DOCKER images --format '{{.Repository}}:{{.Tag}}' 2>/dev/null || true)
    for img_var in $CRITICAL_IMAGE_VARS; do
        # ":-" keeps this safe if the script runs with `set -u`.
        img="${!img_var:-}"
        if [ -z "$img" ]; then
            continue  # Variable not defined in image-versions.sh
        fi
        # Fixed-string substring match, as before. A here-string (not a pipe)
        # avoids a spurious "missing" result from SIGPIPE under `pipefail`.
        if ! grep -qF -- "$img" <<<"$loaded"; then
            missing="$missing $img_var"
        fi
    done
    printf '%s' "$missing"
}

log "Verifying container images..."
MISSING_IMAGES=$(find_missing_images)

if [ -n "$MISSING_IMAGES" ]; then
    log "WARNING: Missing images:$MISSING_IMAGES"
    log "Attempting to re-load from /opt/archipelago/container-images/..."
    RELOAD_COUNT=0
    for tarfile in /opt/archipelago/container-images/*.tar; do
        # The -f test also skips the literal pattern when the glob matches nothing.
        if [ -f "$tarfile" ]; then
            if $DOCKER load -i "$tarfile" 2>>"$LOG"; then
                RELOAD_COUNT=$((RELOAD_COUNT + 1))
            else
                log " Failed to load: $tarfile"
            fi
        fi
    done
    log "Re-loaded $RELOAD_COUNT image tarballs"

    # Re-verify so the outcome of the recovery attempt is visible in the log.
    STILL_MISSING_IMAGES=$(find_missing_images)
    if [ -n "$STILL_MISSING_IMAGES" ]; then
        log "WARNING: Images still missing after re-load:$STILL_MISSING_IMAGES"
    else
        log "All critical images present after re-load"
    fi
else
    log "All critical images verified"
fi
# ── Tier 1: Databases & Core Infrastructure ──────────────────────────────
log "=== Tier 1: Databases & Core Infrastructure ==="
@ -337,13 +385,16 @@ else
$DOCKER network connect archy-net bitcoin-knots 2>/dev/null || true
log "Bitcoin Knots already running"
fi
# Wait for Bitcoin Knots RPC to be responsive
# Check Bitcoin Knots RPC (informational — containers created regardless)
# Dependent containers use --restart=unless-stopped and the health monitor
# will auto-restart them once Bitcoin becomes responsive.
if wait_for_container "Bitcoin Knots RPC" "$DOCKER exec bitcoin-knots bitcoin-cli -rpcuser='$BITCOIN_RPC_USER' -rpcpassword='$BITCOIN_RPC_PASS' getblockchaininfo" 60; then
BITCOIN_READY=true
log "Bitcoin Knots is ready — dependent containers will proceed"
log "Bitcoin Knots is ready"
else
BITCOIN_READY=false
log "WARNING: Bitcoin Knots NOT ready — skipping dependent containers (electrumx, lnd, mempool, btcpay, fedimint)"
log "Bitcoin Knots not yet responsive (normal during IBD) — creating dependent containers anyway"
log " They will auto-restart via health monitor once Bitcoin is ready"
fi
track_container "bitcoin-knots"
@ -355,7 +406,8 @@ if ! $DOCKER exec bitcoin-knots bitcoin-cli "-rpcuser=$BITCOIN_RPC_USER" "-rpcpa
fi
# 2. Mempool stack (matches deploy) — depends on Bitcoin
if [ "$BITCOIN_READY" = "true" ]; then
# Note: containers created regardless of BITCOIN_READY — they will restart
# automatically once Bitcoin becomes responsive (--restart=unless-stopped).
if ! $DOCKER ps -a --format '{{.Names}}' 2>/dev/null | grep -qE 'archy-mempool-db|mysql-mempool'; then
log "Creating mysql-mempool..."
mkdir -p /var/lib/archipelago/mysql-mempool
@ -624,9 +676,7 @@ if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -q fedimint-gateway; th
fi
track_container "fedimint-gateway"
else
log "SKIPPED: mempool stack, electrumx, btcpay stack, lnd, fedimint (Bitcoin not ready)"
fi # end BITCOIN_READY
# (Bitcoin-dependent containers created above regardless of BITCOIN_READY)
# ── Tier 3: Applications (independent — always attempt) ───────────────────
log "=== Tier 3: Applications ==="
@ -742,12 +792,33 @@ fi
track_container "nextcloud"
# Create the SearXNG container unless a running container whose name contains
# "searxng" is already listed (`ps` without -a only lists running containers).
if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -q searxng; then
log "Creating SearXNG..."
# SearXNG requires settings.yml or it exits immediately
SEARXNG_CONF="/var/lib/archipelago/searxng"
# Only generate the file when it does not exist yet, so an existing
# settings.yml (and its secret_key) is never overwritten.
if [ ! -f "$SEARXNG_CONF/settings.yml" ]; then
mkdir -p "$SEARXNG_CONF"
# Fresh random secret (32 random bytes, hex-encoded) for this install.
SEARX_SECRET=$(openssl rand -hex 32)
# The heredoc delimiter is unquoted, so $SEARX_SECRET is expanded into the file.
cat > "$SEARXNG_CONF/settings.yml" <<SEARXCFG
use_default_settings: true
general:
instance_name: Archipelago Search
server:
secret_key: "$SEARX_SECRET"
bind_address: "0.0.0.0"
port: 8080
limiter: false
ui:
default_theme: simple
SEARXCFG
# NOTE(review): 100000:100000 is presumably the host UID/GID that rootless
# podman maps the container user to — confirm against the subuid range.
# Errors from chown are discarded.
chown -R 100000:100000 "$SEARXNG_CONF" 2>/dev/null
log " Created SearXNG settings.yml"
fi
# Start the container with a read-only root filesystem; the host config
# directory is bind-mounted at /etc/searxng. stderr goes to $LOG and a failed
# `run` does not abort the script (|| true).
$DOCKER run -d --name searxng --restart unless-stopped \
--health-cmd="curl -sf http://localhost:8080/ || exit 1" --health-interval=120s --health-timeout=5s --health-retries=3 \
--memory=$(mem_limit searxng) \
--cap-drop ALL --security-opt no-new-privileges:true \
--read-only --tmpfs /tmp:rw,noexec,nosuid,size=256m --tmpfs /run:rw,noexec,nosuid,size=64m \
-p 8888:8080 \
-v /var/lib/archipelago/searxng:/etc/searxng \
"${SEARXNG_IMAGE}" 2>>"$LOG" || true
fi
# track_container is defined elsewhere in this script; presumably it records
# the container's state for the final summary — confirm.
track_container "searxng"
@ -979,8 +1050,29 @@ elif [ -x "/opt/archipelago/scripts/container-doctor.sh" ]; then
bash "/opt/archipelago/scripts/container-doctor.sh" --local 2>&1 | tee -a "$LOG"
fi
# 12. Final summary
# 11b. If any containers failed, run the reconciler to attempt recovery.
# Reads TOTAL and SUCCESS (maintained earlier in this script) and rewrites
# SUCCESS and FAILED for the final summary below.
FAILED=$((TOTAL - SUCCESS))
if [ "$FAILED" -gt 0 ]; then
    log "Attempting to recover $FAILED failed container(s) via reconciler..."
    # Prefer the reconciler next to this script, then the installed copy.
    RECONCILE_SCRIPT=""
    if [ -x "$SCRIPT_DIR/reconcile-containers.sh" ]; then
        RECONCILE_SCRIPT="$SCRIPT_DIR/reconcile-containers.sh"
    elif [ -x "/opt/archipelago/scripts/reconcile-containers.sh" ]; then
        RECONCILE_SCRIPT="/opt/archipelago/scripts/reconcile-containers.sh"
    fi
    if [ -n "$RECONCILE_SCRIPT" ]; then
        # Recovery is best-effort: a reconciler error must not stop the script
        # before the final summary is written.
        runuser -u archipelago -- bash "$RECONCILE_SCRIPT" 2>&1 | tee -a "$LOG" \
            || log "WARNING: reconciler exited with an error"
        # Recount after reconciliation: one container name per output line.
        SUCCESS=$($DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -c . || true)
        # This counts every running container, not only the tracked ones, so
        # it can exceed TOTAL; clamp so FAILED is never reported as negative.
        # NOTE(review): for an exact figure, recount only tracked names, and
        # refresh FAILED_LIST, which is not updated here.
        if [ "$SUCCESS" -gt "$TOTAL" ]; then
            SUCCESS=$TOTAL
        fi
        FAILED=$((TOTAL - SUCCESS))
        log "After reconciliation: $SUCCESS running, $FAILED still failed"
    fi
fi
# 12. Final summary
log "============================================="
log " FIRST-BOOT CONTAINER SUMMARY"
log "============================================="
@ -988,7 +1080,7 @@ log " Total tracked: $TOTAL"
log " Running: $SUCCESS"
log " Failed: $FAILED"
if [ "$BITCOIN_READY" != "true" ]; then
log " Bitcoin: NOT READY (dependent containers skipped)"
log " Bitcoin: NOT READY (dependent containers will auto-restart when ready)"
fi
if [ -n "$FAILED_LIST" ]; then
log " Failed list: $FAILED_LIST"