fix(lnd): self-heal unrecoverable locked wallet via wipe+recreate
When an existing LND wallet is locked and none of the candidate passwords
(per-node secret, legacy constant) open it, the node can never auto-unlock
unattended. unlock_existing_wallet now returns Ok(false) for "all candidates
actively rejected" (vs Err for transient "LND not ready"), and
ensure_wallet_initialized responds by recreating the wallet:
- mark the lnd container user-stopped so the health monitor won't
re-launch it (and re-open the wallet) mid-wipe,
- stop lnd, delete its wallet/chain/graph state as root,
- start lnd, wait for NON_EXISTING, re-init a fresh wallet on the
per-node secret, then clear the user-stopped flag.
LND runs as a plain bridge-network podman container (not a Quadlet unit),
so it is restarted via `systemd-run --user --scope podman`, matching the
orchestrator/health-monitor path.
Alpha nodes hold no funds and a wallet locked with an unknown password is
already inaccessible, so the wipe loses nothing reachable. Completes the
forward fix from 91adc281 for nodes whose wallet pre-dates the per-node
secret and whose password is unrecorded (e.g. .116/.228).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
459046b21c
commit
3214d6aff3
@ -88,15 +88,125 @@ pub async fn ensure_wallet_initialized() -> Result<()> {
|
||||
if file_exists_as_root(admin_macaroon).await && lnd_getinfo_ready(admin_macaroon).await {
|
||||
return Ok(());
|
||||
}
|
||||
unlock_existing_wallet().await?;
|
||||
wait_for_admin_macaroon(admin_macaroon).await?;
|
||||
return Ok(());
|
||||
match unlock_existing_wallet().await? {
|
||||
true => {
|
||||
wait_for_admin_macaroon(admin_macaroon).await?;
|
||||
return Ok(());
|
||||
}
|
||||
false => {
|
||||
// Every candidate password was actively rejected: this wallet was
|
||||
// created with a password this node no longer has, so it can never
|
||||
// auto-unlock unattended. Alpha nodes hold no real funds and a wallet
|
||||
// locked with an unknown password is already inaccessible, so wipe +
|
||||
// recreate it on the per-node secret to self-heal at boot.
|
||||
recreate_wallet_destructively().await?;
|
||||
wait_for_admin_macaroon(admin_macaroon).await?;
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
init_wallet_via_rest().await?;
|
||||
wait_for_admin_macaroon(admin_macaroon).await
|
||||
}
|
||||
|
||||
/// LND data subdirectories holding wallet + channel + graph state. Removing them
|
||||
/// returns LND to a NON_EXISTING wallet state. Funds-bearing data lives here too,
|
||||
/// so deletion is destructive — only done once the wallet is already unrecoverable.
|
||||
const LND_STATE_DIRS: &[&str] = &[
|
||||
"/var/lib/archipelago/lnd/data/chain",
|
||||
"/var/lib/archipelago/lnd/data/graph",
|
||||
];
|
||||
|
||||
/// Podman container name for the core LND app (see `compute_container_name`:
|
||||
/// non-UI core apps keep their bare id). LND runs as a plain bridge-network
|
||||
/// container, not a Quadlet unit, so it is restarted via `podman`, not systemctl.
|
||||
const LND_CONTAINER: &str = "lnd";
|
||||
|
||||
/// Archipelago data dir (default; not overridden in prod). Holds the
|
||||
/// `user-stopped.json` that gates health-monitor auto-restart.
|
||||
const ARCHY_DATA_DIR: &str = "/var/lib/archipelago";
|
||||
|
||||
/// Destroy an unrecoverable LND wallet and recreate a fresh one keyed to the
|
||||
/// per-node secret. Suppresses health-monitor auto-restart for the wipe window,
|
||||
/// stops LND, deletes its wallet/chain/graph state as root, restarts it, waits
|
||||
/// for NON_EXISTING, then inits a fresh wallet. Destructive — only called when no
|
||||
/// candidate password can open the existing wallet.
|
||||
async fn recreate_wallet_destructively() -> Result<()> {
|
||||
tracing::warn!(
|
||||
"[lnd] wallet is locked with an unknown password and cannot auto-unlock; \
|
||||
wiping and recreating it on the per-node secret (DESTRUCTIVE)"
|
||||
);
|
||||
|
||||
// The health monitor restarts any container it sees stopped; mark LND
|
||||
// user-stopped so it doesn't re-launch (and re-open the wallet) mid-wipe.
|
||||
// Always cleared below so LND auto-recovers normally afterwards.
|
||||
let data_dir = std::path::Path::new(ARCHY_DATA_DIR);
|
||||
crate::crash_recovery::mark_user_stopped(data_dir, LND_CONTAINER).await;
|
||||
let result = wipe_and_reinit_wallet().await;
|
||||
crate::crash_recovery::clear_user_stopped(data_dir, LND_CONTAINER).await;
|
||||
result
|
||||
}
|
||||
|
||||
async fn wipe_and_reinit_wallet() -> Result<()> {
|
||||
podman_user_scoped(&["stop", LND_CONTAINER])
|
||||
.await
|
||||
.context("stopping lnd before wallet wipe")?;
|
||||
|
||||
for dir in LND_STATE_DIRS {
|
||||
let status = host_sudo(&["rm", "-rf", dir])
|
||||
.await
|
||||
.with_context(|| format!("removing {dir}"))?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("removing {dir} exited with {status}");
|
||||
}
|
||||
}
|
||||
|
||||
podman_user_scoped(&["start", LND_CONTAINER])
|
||||
.await
|
||||
.context("restarting lnd after wallet wipe")?;
|
||||
|
||||
wait_for_wallet_state("NON_EXISTING").await?;
|
||||
init_wallet_via_rest().await
|
||||
}
|
||||
|
||||
/// Run `podman <args>` inside a transient `systemd-run --user --scope`, matching
|
||||
/// how the orchestrator/health-monitor manage rootless containers (keeps the
|
||||
/// container out of the archipelago service's cgroup).
|
||||
async fn podman_user_scoped(args: &[&str]) -> Result<()> {
|
||||
let out = tokio::process::Command::new("systemd-run")
|
||||
.args(["--user", "--scope", "--quiet", "--collect", "podman"])
|
||||
.args(args)
|
||||
.output()
|
||||
.await
|
||||
.with_context(|| format!("systemd-run --user --scope podman {}", args.join(" ")))?;
|
||||
if !out.status.success() {
|
||||
anyhow::bail!(
|
||||
"podman {} failed: {}",
|
||||
args.join(" "),
|
||||
String::from_utf8_lossy(&out.stderr).trim()
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Poll `/v1/state` until LND reports `target`, or time out after ~120s.
|
||||
async fn wait_for_wallet_state(target: &str) -> Result<()> {
|
||||
let client = reqwest::Client::builder()
|
||||
.no_proxy()
|
||||
.timeout(std::time::Duration::from_secs(5))
|
||||
.danger_accept_invalid_certs(true)
|
||||
.build()
|
||||
.context("building LND REST client")?;
|
||||
for _ in 0..120 {
|
||||
if wallet_state(&client).await.as_deref() == Some(target) {
|
||||
return Ok(());
|
||||
}
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
}
|
||||
anyhow::bail!("LND did not reach state {target} after wallet wipe")
|
||||
}
|
||||
|
||||
async fn file_exists_as_root(path: &str) -> bool {
|
||||
if std::path::Path::new(path).exists() {
|
||||
return true;
|
||||
@ -212,11 +322,14 @@ async fn try_unlock_once(client: &reqwest::Client, password: &str) -> UnlockAtte
|
||||
}
|
||||
}
|
||||
|
||||
async fn unlock_existing_wallet() -> Result<()> {
|
||||
/// Unlock an existing wallet. Ok(true) = unlocked; Ok(false) = every candidate
|
||||
/// password was actively rejected (unrecoverable — caller should recreate);
|
||||
/// Err = transient (LND not ready / timeout — caller should retry, NOT wipe).
|
||||
async fn unlock_existing_wallet() -> Result<bool> {
|
||||
unlock_existing_wallet_via_rest().await
|
||||
}
|
||||
|
||||
async fn unlock_existing_wallet_via_rest() -> Result<()> {
|
||||
async fn unlock_existing_wallet_via_rest() -> Result<bool> {
|
||||
let client = reqwest::Client::builder()
|
||||
.no_proxy()
|
||||
.timeout(std::time::Duration::from_secs(20))
|
||||
@ -233,18 +346,18 @@ async fn unlock_existing_wallet_via_rest() -> Result<()> {
|
||||
let mut all_rejected = true;
|
||||
for pw in &candidates {
|
||||
match try_unlock_once(&client, pw).await {
|
||||
UnlockAttempt::Unlocked => return Ok(()),
|
||||
UnlockAttempt::Unlocked => return Ok(true),
|
||||
UnlockAttempt::WrongPassword => {}
|
||||
UnlockAttempt::NotReady => all_rejected = false,
|
||||
}
|
||||
}
|
||||
if all_rejected {
|
||||
anyhow::bail!(
|
||||
"LND wallet unlock failed: none of the {} candidate password(s) were accepted \
|
||||
— the wallet was created with a password this node does not have; \
|
||||
user-assisted migration or seed-recovery is required",
|
||||
tracing::warn!(
|
||||
"[lnd] none of the {} candidate password(s) unlock the wallet — it was created \
|
||||
with a password this node does not have",
|
||||
candidates.len()
|
||||
);
|
||||
return Ok(false);
|
||||
}
|
||||
tokio::time::sleep(std::time::Duration::from_secs(1)).await;
|
||||
}
|
||||
@ -401,7 +514,9 @@ async fn init_wallet_via_rest() -> Result<()> {
|
||||
.context("initializing LND wallet")?
|
||||
{
|
||||
UnlockerResponse::Value(_) => {}
|
||||
UnlockerResponse::WalletAlreadyExists => unlock_existing_wallet().await?,
|
||||
UnlockerResponse::WalletAlreadyExists => {
|
||||
unlock_existing_wallet().await?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user