From 3214d6aff393ea058b918c7f20deea018d2df02f Mon Sep 17 00:00:00 2001 From: archipelago Date: Sun, 14 Jun 2026 13:35:36 -0400 Subject: [PATCH] fix(lnd): self-heal unrecoverable locked wallet via wipe+recreate When an existing LND wallet is locked and none of the candidate passwords (per-node secret, legacy constant) open it, the node can never auto-unlock unattended. unlock_existing_wallet now returns Ok(false) for "all candidates actively rejected" (vs Err for transient "LND not ready"), and ensure_wallet_initialized responds by recreating the wallet: - mark the lnd container user-stopped so the health monitor won't re-launch it (and re-open the wallet) mid-wipe, - stop lnd, delete its wallet/chain/graph state as root, - start lnd, wait for NON_EXISTING, re-init a fresh wallet on the per-node secret, then clear the user-stopped flag. LND runs as a plain bridge-network podman container (not a Quadlet unit), so it is restarted via `systemd-run --user --scope podman`, matching the orchestrator/health-monitor path. Alpha nodes hold no funds and a wallet locked with an unknown password is already inaccessible, so the wipe loses nothing reachable. Completes the forward fix from 91adc281 for nodes whose wallet pre-dates the per-node secret and whose password is unrecorded (e.g. .116/.228). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- core/archipelago/src/container/lnd.rs | 137 +++++++++++++++++++++++--- 1 file changed, 126 insertions(+), 11 deletions(-) diff --git a/core/archipelago/src/container/lnd.rs b/core/archipelago/src/container/lnd.rs index e29ee1c0..3639454d 100644 --- a/core/archipelago/src/container/lnd.rs +++ b/core/archipelago/src/container/lnd.rs @@ -88,15 +88,125 @@ pub async fn ensure_wallet_initialized() -> Result<()> { if file_exists_as_root(admin_macaroon).await && lnd_getinfo_ready(admin_macaroon).await { return Ok(()); } - unlock_existing_wallet().await?; - wait_for_admin_macaroon(admin_macaroon).await?; - return Ok(()); + match unlock_existing_wallet().await? { + true => { + wait_for_admin_macaroon(admin_macaroon).await?; + return Ok(()); + } + false => { + // Every candidate password was actively rejected: this wallet was + // created with a password this node no longer has, so it can never + // auto-unlock unattended. Alpha nodes hold no real funds and a wallet + // locked with an unknown password is already inaccessible, so wipe + + // recreate it on the per-node secret to self-heal at boot. + recreate_wallet_destructively().await?; + wait_for_admin_macaroon(admin_macaroon).await?; + return Ok(()); + } + } } init_wallet_via_rest().await?; wait_for_admin_macaroon(admin_macaroon).await } +/// LND data subdirectories holding wallet + channel + graph state. Removing them +/// returns LND to a NON_EXISTING wallet state. Funds-bearing data lives here too, +/// so deletion is destructive — only done once the wallet is already unrecoverable. +const LND_STATE_DIRS: &[&str] = &[ + "/var/lib/archipelago/lnd/data/chain", + "/var/lib/archipelago/lnd/data/graph", +]; + +/// Podman container name for the core LND app (see `compute_container_name`: +/// non-UI core apps keep their bare id). LND runs as a plain bridge-network +/// container, not a Quadlet unit, so it is restarted via `podman`, not systemctl. 
+const LND_CONTAINER: &str = "lnd"; + +/// Archipelago data dir (default; not overridden in prod). Holds the +/// `user-stopped.json` that gates health-monitor auto-restart. +const ARCHY_DATA_DIR: &str = "/var/lib/archipelago"; + +/// Destroy an unrecoverable LND wallet and recreate a fresh one keyed to the +/// per-node secret. Suppresses health-monitor auto-restart for the wipe window, +/// stops LND, deletes its wallet/chain/graph state as root, restarts it, waits +/// for NON_EXISTING, then inits a fresh wallet. Destructive — only called when no +/// candidate password can open the existing wallet. +async fn recreate_wallet_destructively() -> Result<()> { + tracing::warn!( + "[lnd] wallet is locked with an unknown password and cannot auto-unlock; \ + wiping and recreating it on the per-node secret (DESTRUCTIVE)" + ); + + // The health monitor restarts any container it sees stopped; mark LND + // user-stopped so it doesn't re-launch (and re-open the wallet) mid-wipe. + // Always cleared below so LND auto-recovers normally afterwards. 
+ let data_dir = std::path::Path::new(ARCHY_DATA_DIR); + crate::crash_recovery::mark_user_stopped(data_dir, LND_CONTAINER).await; + let result = wipe_and_reinit_wallet().await; + crate::crash_recovery::clear_user_stopped(data_dir, LND_CONTAINER).await; + result +} + +async fn wipe_and_reinit_wallet() -> Result<()> { + podman_user_scoped(&["stop", LND_CONTAINER]) + .await + .context("stopping lnd before wallet wipe")?; + + for dir in LND_STATE_DIRS { + let status = host_sudo(&["rm", "-rf", dir]) + .await + .with_context(|| format!("removing {dir}"))?; + if !status.success() { + anyhow::bail!("removing {dir} exited with {status}"); + } + } + + podman_user_scoped(&["start", LND_CONTAINER]) + .await + .context("restarting lnd after wallet wipe")?; + + wait_for_wallet_state("NON_EXISTING").await?; + init_wallet_via_rest().await +} + +/// Run `podman <args>` inside a transient `systemd-run --user --scope`, matching +/// how the orchestrator/health-monitor manage rootless containers (keeps the +/// container out of the archipelago service's cgroup). +async fn podman_user_scoped(args: &[&str]) -> Result<()> { + let out = tokio::process::Command::new("systemd-run") + .args(["--user", "--scope", "--quiet", "--collect", "podman"]) + .args(args) + .output() + .await + .with_context(|| format!("systemd-run --user --scope podman {}", args.join(" ")))?; + if !out.status.success() { + anyhow::bail!( + "podman {} failed: {}", + args.join(" "), + String::from_utf8_lossy(&out.stderr).trim() + ); + } + Ok(()) +} + +/// Poll `/v1/state` until LND reports `target`, or time out after ~120s.
+async fn wait_for_wallet_state(target: &str) -> Result<()> { + let client = reqwest::Client::builder() + .no_proxy() + .timeout(std::time::Duration::from_secs(5)) + .danger_accept_invalid_certs(true) + .build() + .context("building LND REST client")?; + for _ in 0..120 { + if wallet_state(&client).await.as_deref() == Some(target) { + return Ok(()); + } + tokio::time::sleep(std::time::Duration::from_secs(1)).await; + } + anyhow::bail!("LND did not reach state {target} after wallet wipe") +} + async fn file_exists_as_root(path: &str) -> bool { if std::path::Path::new(path).exists() { return true; @@ -212,11 +322,14 @@ async fn try_unlock_once(client: &reqwest::Client, password: &str) -> UnlockAtte } } -async fn unlock_existing_wallet() -> Result<()> { +/// Unlock an existing wallet. Ok(true) = unlocked; Ok(false) = every candidate +/// password was actively rejected (unrecoverable — caller should recreate); +/// Err = transient (LND not ready / timeout — caller should retry, NOT wipe). +async fn unlock_existing_wallet() -> Result<bool> { unlock_existing_wallet_via_rest().await } -async fn unlock_existing_wallet_via_rest() -> Result<()> { +async fn unlock_existing_wallet_via_rest() -> Result<bool> { let client = reqwest::Client::builder() .no_proxy() .timeout(std::time::Duration::from_secs(20)) @@ -233,18 +346,18 @@ async fn unlock_existing_wallet_via_rest() -> Result<()> { let mut all_rejected = true; for pw in &candidates { match try_unlock_once(&client, pw).await { - UnlockAttempt::Unlocked => return Ok(()), + UnlockAttempt::Unlocked => return Ok(true), UnlockAttempt::WrongPassword => {} UnlockAttempt::NotReady => all_rejected = false, } } if all_rejected { - anyhow::bail!( - "LND wallet unlock failed: none of the {} candidate password(s) were accepted \ - — the wallet was created with a password this node does not have; \ - user-assisted migration or seed-recovery is required", + tracing::warn!( + "[lnd] none of the {} candidate password(s) unlock the wallet — it was created
\ + with a password this node does not have", candidates.len() ); + return Ok(false); } tokio::time::sleep(std::time::Duration::from_secs(1)).await; } @@ -401,7 +514,9 @@ async fn init_wallet_via_rest() -> Result<()> { .context("initializing LND wallet")? { UnlockerResponse::Value(_) => {} - UnlockerResponse::WalletAlreadyExists => unlock_existing_wallet().await?, + UnlockerResponse::WalletAlreadyExists => { + unlock_existing_wallet().await?; + } } Ok(())