fix(lnd): self-heal unrecoverable locked wallet via wipe+recreate

When an existing LND wallet is locked and none of the candidate passwords
(per-node secret, legacy constant) open it, the node can never auto-unlock
unattended. unlock_existing_wallet now returns Ok(false) for "all candidates
actively rejected" (vs Err for transient "LND not ready"), and
ensure_wallet_initialized responds by recreating the wallet:

  - mark the lnd container user-stopped so the health monitor won't
    re-launch it (and re-open the wallet) mid-wipe,
  - stop lnd, delete its wallet/chain/graph state as root,
  - start lnd, wait for NON_EXISTING, re-init a fresh wallet on the
    per-node secret, then clear the user-stopped flag.

LND runs as a plain bridge-network podman container (not a Quadlet unit),
so it is restarted via `systemd-run --user --scope podman`, matching the
orchestrator/health-monitor path.

Alpha nodes hold no funds and a wallet locked with an unknown password is
already inaccessible, so the wipe loses nothing reachable. Completes the
forward fix from 91adc281 for nodes whose wallet pre-dates the per-node
secret and whose password is unrecorded (e.g. .116/.228).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-06-14 13:35:36 -04:00
parent 459046b21c
commit 3214d6aff3

View File

@ -88,15 +88,125 @@ pub async fn ensure_wallet_initialized() -> Result<()> {
if file_exists_as_root(admin_macaroon).await && lnd_getinfo_ready(admin_macaroon).await {
return Ok(());
}
unlock_existing_wallet().await?;
wait_for_admin_macaroon(admin_macaroon).await?;
return Ok(());
match unlock_existing_wallet().await? {
true => {
wait_for_admin_macaroon(admin_macaroon).await?;
return Ok(());
}
false => {
// Every candidate password was actively rejected: this wallet was
// created with a password this node no longer has, so it can never
// auto-unlock unattended. Alpha nodes hold no real funds and a wallet
// locked with an unknown password is already inaccessible, so wipe +
// recreate it on the per-node secret to self-heal at boot.
recreate_wallet_destructively().await?;
wait_for_admin_macaroon(admin_macaroon).await?;
return Ok(());
}
}
}
init_wallet_via_rest().await?;
wait_for_admin_macaroon(admin_macaroon).await
}
/// Host directories that `wipe_and_reinit_wallet` deletes (`rm -rf` through
/// `host_sudo`) to reset LND. Removing them returns LND to a NON_EXISTING
/// wallet state. Funds-bearing data lives here too, so deletion is
/// destructive — only done once the wallet is already unrecoverable.
///
/// NOTE(review): only `data/chain` and `data/graph` are listed; presumably the
/// wallet database sits under `data/chain` and the channel database under
/// `data/graph` (LND's default layout) — confirm against the deployed config.
const LND_STATE_DIRS: &[&str] = &[
"/var/lib/archipelago/lnd/data/chain",
"/var/lib/archipelago/lnd/data/graph",
];
/// Podman container name for the core LND app (see `compute_container_name`:
/// non-UI core apps keep their bare id). LND runs as a plain bridge-network
/// container, not a Quadlet unit, so it is restarted via `podman`, not systemctl.
///
/// Used both as the `podman stop` / `podman start` target and as the key under
/// which the container is marked (and later unmarked) as user-stopped.
const LND_CONTAINER: &str = "lnd";
/// Archipelago data dir (default; not overridden in prod). Holds the
/// `user-stopped.json` that gates health-monitor auto-restart.
///
/// Passed to `crash_recovery::mark_user_stopped` / `clear_user_stopped` as the
/// directory in which the user-stopped marker is maintained.
const ARCHY_DATA_DIR: &str = "/var/lib/archipelago";
/// Destroy an unrecoverable LND wallet and recreate a fresh one keyed to the
/// per-node secret. Suppresses health-monitor auto-restart for the wipe window,
/// stops LND, deletes its wallet/chain/graph state as root, restarts it, waits
/// for NON_EXISTING, then inits a fresh wallet. Destructive — only called when no
/// candidate password can open the existing wallet.
async fn recreate_wallet_destructively() -> Result<()> {
tracing::warn!(
"[lnd] wallet is locked with an unknown password and cannot auto-unlock; \
wiping and recreating it on the per-node secret (DESTRUCTIVE)"
);
// The health monitor restarts any container it sees stopped; mark LND
// user-stopped so it doesn't re-launch (and re-open the wallet) mid-wipe.
// Always cleared below so LND auto-recovers normally afterwards.
let data_dir = std::path::Path::new(ARCHY_DATA_DIR);
crate::crash_recovery::mark_user_stopped(data_dir, LND_CONTAINER).await;
let result = wipe_and_reinit_wallet().await;
crate::crash_recovery::clear_user_stopped(data_dir, LND_CONTAINER).await;
result
}
/// Perform the destructive reset itself, aborting at the first failing step:
///
/// 1. stop the LND container,
/// 2. delete every directory in `LND_STATE_DIRS` via `host_sudo`,
/// 3. start the LND container again,
/// 4. wait until LND reports the `NON_EXISTING` wallet state,
/// 5. initialise a fresh wallet over REST.
///
/// # Errors
/// Fails if podman cannot stop or start the container, if any `rm -rf` cannot
/// be run or exits unsuccessfully, if LND never reaches `NON_EXISTING`, or if
/// wallet initialisation fails.
async fn wipe_and_reinit_wallet() -> Result<()> {
    let stop_outcome = podman_user_scoped(&["stop", LND_CONTAINER]).await;
    stop_outcome.context("stopping lnd before wallet wipe")?;

    for state_dir in LND_STATE_DIRS {
        let rm_status = host_sudo(&["rm", "-rf", state_dir])
            .await
            .with_context(|| format!("removing {state_dir}"))?;
        // A spawned-but-failed `rm` must abort too: restarting LND on
        // half-deleted state would not yield a clean NON_EXISTING wallet.
        anyhow::ensure!(
            rm_status.success(),
            "removing {state_dir} exited with {rm_status}"
        );
    }

    let start_outcome = podman_user_scoped(&["start", LND_CONTAINER]).await;
    start_outcome.context("restarting lnd after wallet wipe")?;

    // Only initialise once LND itself confirms there is no wallet left.
    wait_for_wallet_state("NON_EXISTING").await?;
    init_wallet_via_rest().await
}
/// Execute `podman <args>` wrapped in a transient `systemd-run --user --scope`
/// unit, the same way the orchestrator/health-monitor drive rootless
/// containers, so the container stays out of the archipelago service's cgroup.
///
/// # Errors
/// Fails if `systemd-run` cannot be spawned, or if the command exits
/// unsuccessfully — in which case the trimmed stderr is included in the error.
async fn podman_user_scoped(args: &[&str]) -> Result<()> {
    let mut command = tokio::process::Command::new("systemd-run");
    command.args(["--user", "--scope", "--quiet", "--collect", "podman"]);
    command.args(args);

    let output = command
        .output()
        .await
        .with_context(|| format!("systemd-run --user --scope podman {}", args.join(" ")))?;

    if output.status.success() {
        return Ok(());
    }

    let stderr_text = String::from_utf8_lossy(&output.stderr);
    anyhow::bail!("podman {} failed: {}", args.join(" "), stderr_text.trim())
}
/// Poll LND's wallet state until it equals `target`, giving up after 120s of
/// wall-clock time.
///
/// The state is sampled through `wallet_state` roughly once per second, using
/// a REST client with no proxy, certificate verification disabled and a 5s
/// per-request timeout.
///
/// The 120s budget covers the whole wait, including time spent inside slow or
/// hanging requests. Counting 120 loop iterations instead would let each
/// iteration take up to 5s (request timeout) plus 1s (sleep), stretching the
/// advertised ~120s to roughly 12 minutes when LND is unresponsive.
///
/// # Errors
/// Fails if the REST client cannot be built, or if `target` is not observed
/// before the budget is exhausted.
async fn wait_for_wallet_state(target: &str) -> Result<()> {
    /// Upper bound on the whole wait.
    const WAIT_BUDGET: std::time::Duration = std::time::Duration::from_secs(120);
    /// Pause between two consecutive state samples.
    const POLL_INTERVAL: std::time::Duration = std::time::Duration::from_secs(1);

    let client = reqwest::Client::builder()
        .no_proxy()
        .timeout(std::time::Duration::from_secs(5))
        .danger_accept_invalid_certs(true)
        .build()
        .context("building LND REST client")?;

    // Always samples at least once; completes only when the target state is seen.
    let poll_until_target = async {
        loop {
            if wallet_state(&client).await.as_deref() == Some(target) {
                return;
            }
            tokio::time::sleep(POLL_INTERVAL).await;
        }
    };

    // `timeout` drops the polling future (and any in-flight request) once the
    // budget is spent, so the bound holds even if a request is hanging.
    tokio::time::timeout(WAIT_BUDGET, poll_until_target)
        .await
        .map_err(|_elapsed| anyhow::anyhow!("LND did not reach state {target} after wallet wipe"))
}
async fn file_exists_as_root(path: &str) -> bool {
if std::path::Path::new(path).exists() {
return true;
@ -212,11 +322,14 @@ async fn try_unlock_once(client: &reqwest::Client, password: &str) -> UnlockAtte
}
}
async fn unlock_existing_wallet() -> Result<()> {
/// Unlock the already-existing LND wallet. Thin wrapper that delegates to
/// `unlock_existing_wallet_via_rest`.
///
/// Outcomes:
/// - `Ok(true)`  — the wallet was unlocked.
/// - `Ok(false)` — every candidate password was actively rejected
///   (unrecoverable — the caller should recreate the wallet).
/// - `Err(_)`    — transient condition (LND not ready / timeout — the caller
///   should retry, NOT wipe).
async fn unlock_existing_wallet() -> Result<bool> {
unlock_existing_wallet_via_rest().await
}
async fn unlock_existing_wallet_via_rest() -> Result<()> {
async fn unlock_existing_wallet_via_rest() -> Result<bool> {
let client = reqwest::Client::builder()
.no_proxy()
.timeout(std::time::Duration::from_secs(20))
@ -233,18 +346,18 @@ async fn unlock_existing_wallet_via_rest() -> Result<()> {
let mut all_rejected = true;
for pw in &candidates {
match try_unlock_once(&client, pw).await {
UnlockAttempt::Unlocked => return Ok(()),
UnlockAttempt::Unlocked => return Ok(true),
UnlockAttempt::WrongPassword => {}
UnlockAttempt::NotReady => all_rejected = false,
}
}
if all_rejected {
anyhow::bail!(
"LND wallet unlock failed: none of the {} candidate password(s) were accepted \
the wallet was created with a password this node does not have; \
user-assisted migration or seed-recovery is required",
tracing::warn!(
"[lnd] none of the {} candidate password(s) unlock the wallet — it was created \
with a password this node does not have",
candidates.len()
);
return Ok(false);
}
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
}
@ -401,7 +514,9 @@ async fn init_wallet_via_rest() -> Result<()> {
.context("initializing LND wallet")?
{
UnlockerResponse::Value(_) => {}
UnlockerResponse::WalletAlreadyExists => unlock_existing_wallet().await?,
UnlockerResponse::WalletAlreadyExists => {
unlock_existing_wallet().await?;
}
}
Ok(())