fix(federation): reliable symmetric auto-federation across LAN/Tor/FIPS

Federated nodes failed to converge to full-mesh across the LAN<->Tailscale
boundary: nodes were invisible to peers, sync 'took ages'/timed out, and
names only updated on a manual sync. Onions were healthy in both directions
(~3-5s); the failures were app-layer.

- B: federation dials fast-fail a dead FIPS path via .fips_timeout(6s) in
  sync_with_peer + notify_join, so the Tor fallback isn't stuck behind the
  full 30s FIPS budget when LAN and remote peers share no FIPS path.
- A: notify_join (peer-joined) now spawns a background task that retries with
  backoff instead of awaiting a single best-effort POST, so the join RPC
  returns immediately (no 'Request timeout') and the inviter reliably learns
  of the joiner (previously federation could end up asymmetric).
- C: new 90s periodic federation auto-sync (none existed) so renamed nodes
  and roster changes propagate without a manual Sync click.
- self-heal: each auto-sync re-asserts membership to any peer that doesn't
  list us back, converging the fleet to full-mesh and healing pre-existing
  asymmetry with no manual re-joins.

Validated live across 7 nodes: a previously fleet-invisible node became
fully meshed automatically (logs: 'auto-sync ... reasserted=1',
'peer-joined ... delivered').

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-06-19 09:50:10 -04:00
parent 1bce694ebb
commit 837cc02812
4 changed files with 153 additions and 5 deletions

View File

@ -254,11 +254,57 @@ pub(crate) async fn notify_join(
"params": params,
});
let _ = crate::fips::dial::PeerRequest::new(remote_fips_npub, remote_onion, "/rpc/v1")
.service(crate::settings::transport::PeerService::Federation)
.timeout(std::time::Duration::from_secs(30))
.send_json(&body)
.await;
// Deliver the notification in the BACKGROUND with retries, and return
// immediately. Two reasons:
// 1. The join RPC must not block on this. Awaiting a cold FIPS overlay
// (no shared FIPS path between LAN and remote/Tailscale peers) stalled
// the whole join until FIPS timed out, surfacing as "Request timeout"
// in the UI even though the local membership was already saved.
// 2. If this single best-effort POST failed, the inviter never learned
// about us → asymmetric federation (they couldn't see us). Retrying in
// the background until it lands makes federation converge to symmetric.
// `fips_timeout` fast-fails a dead FIPS path so the Tor fallback (which
// answers an onion in ~3-5s) is reached quickly on each attempt.
let remote_onion = remote_onion.to_string();
let remote_fips_npub = remote_fips_npub.map(|s| s.to_string());
tokio::spawn(async move {
    // Up to MAX_ATTEMPTS deliveries with linear backoff between consecutive
    // attempts (10s, 20s, 30s, 40s) — covers a peer that is briefly
    // unreachable (restarting, publishing its onion) without hammering it.
    const MAX_ATTEMPTS: u32 = 5;
    for attempt in 1..=MAX_ATTEMPTS {
        let res = crate::fips::dial::PeerRequest::new(
            remote_fips_npub.as_deref(),
            &remote_onion,
            "/rpc/v1",
        )
        .service(crate::settings::transport::PeerService::Federation)
        .timeout(std::time::Duration::from_secs(30))
        .fips_timeout(std::time::Duration::from_secs(6))
        .send_json(&body)
        .await;
        match res {
            // Only a success status counts as delivered; the task ends here.
            Ok((resp, transport)) if resp.status().is_success() => {
                tracing::info!(
                    attempt,
                    transport = %transport,
                    "peer-joined notification delivered to inviter"
                );
                return;
            }
            // The peer answered with a non-success status.
            Ok((resp, _)) => tracing::warn!(
                attempt,
                status = %resp.status(),
                "peer-joined notification rejected; will retry"
            ),
            // The request itself failed.
            Err(e) => tracing::warn!(
                attempt,
                error = %e,
                "peer-joined notification failed; will retry"
            ),
        }
        // Back off only when another attempt follows. Sleeping after the
        // final failure would merely delay the give-up warning below and keep
        // this task alive for no benefit.
        if attempt < MAX_ATTEMPTS {
            tokio::time::sleep(std::time::Duration::from_secs(10 * u64::from(attempt))).await;
        }
    }
    tracing::warn!(
        onion = %remote_onion,
        "peer-joined notification gave up after retries — peer may not see us until next sync"
    );
});
Ok(())
}

View File

@ -12,6 +12,9 @@ mod types;
// Re-export all public items so `crate::federation::*` continues to work.
pub use invites::{accept_invite, create_invite};
// Crate-internal: used by the periodic federation auto-sync to re-assert
// membership to peers that don't list us back (asymmetry self-heal).
pub(crate) use invites::notify_join;
#[allow(unused_imports)]
pub use storage::{
add_node, fips_npub_for_onion, load_nodes, load_removed_dids, record_peer_transport,

View File

@ -33,6 +33,12 @@ pub async fn sync_with_peer(
.header("X-Federation-Sig", signature)
.header("X-Federation-Timestamp", timestamp)
.timeout(std::time::Duration::from_secs(30))
// Fast-fail a cold/unreachable FIPS overlay (common between LAN and
// remote/Tailscale peers that share no FIPS path) so the Tor fallback —
// which answers an onion in ~3-5s — isn't stuck behind the full 30s FIPS
// budget. Without this, a state sync to a FIPS-unreachable peer "took
// ages" and join/sync appeared to time out even though Tor was healthy.
.fips_timeout(std::time::Duration::from_secs(6))
.send_json(&body)
.await
.context("Failed to reach federated peer")?;

View File

@ -428,6 +428,99 @@ impl Server {
});
}
// Periodic federation auto-sync. Pulls every federated peer's state on a
// timer so renamed nodes and roster changes propagate WITHOUT a manual
// "Sync" click. Each sync now fast-fails a dead FIPS path and falls back
// to Tor (~3-5s), so a full pass over a handful of peers is quick.
{
    let data_dir = config.data_dir.clone();
    let state = state_manager.clone();
    tokio::spawn(async move {
        // Delay the first pass so Tor/onion publishing settles after boot.
        tokio::time::sleep(Duration::from_secs(20)).await;
        let mut interval = tokio::time::interval(Duration::from_secs(90));
        // Peers are dialled one after another, so a pass can outlast the 90s
        // period when several of them are unreachable. The default `Burst`
        // policy would then replay every missed tick back-to-back; `Delay`
        // spaces subsequent ticks a full period apart instead.
        interval.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
        loop {
            interval.tick().await;

            // Nothing to do when the roster is empty or cannot be loaded.
            let nodes = match crate::federation::load_nodes(&data_dir).await {
                Ok(n) if !n.is_empty() => n,
                _ => continue,
            };
            let (snap, _) = state.get_snapshot().await;
            let local_did =
                match crate::identity::did_key_from_pubkey_hex(&snap.server_info.pubkey) {
                    Ok(d) => d,
                    Err(_) => continue,
                };
            let identity_dir = data_dir.join("identity");
            let node_identity =
                match crate::identity::NodeIdentity::load_or_create(&identity_dir).await {
                    Ok(id) => id,
                    Err(_) => continue,
                };

            // Our own identity, for re-asserting membership to any peer
            // that doesn't list us back (asymmetry self-heal, below).
            let local_onion = snap.server_info.tor_address.clone().unwrap_or_default();
            let local_pubkey = snap.server_info.pubkey.clone();
            let local_name = snap.server_info.name.clone();
            let local_fips_npub =
                crate::identity::fips_npub(&identity_dir).await.unwrap_or(None);

            // `ok` counts peers whose sync succeeded; `healed` counts
            // re-assertions that `notify_join` accepted.
            let mut ok = 0usize;
            let mut healed = 0usize;
            for node in &nodes {
                if node.trust_level == crate::federation::TrustLevel::Untrusted {
                    continue;
                }
                match crate::federation::sync_with_peer(&data_dir, node, &local_did, |b| {
                    node_identity.sign(b)
                })
                .await
                {
                    Ok(peer_state) => {
                        ok += 1;
                        // Asymmetry self-heal: if this peer's exported
                        // trusted list doesn't include us, our original
                        // peer-joined never landed (e.g. it was sent
                        // before the reliable-notify fix, or the peer was
                        // down). Re-assert membership so they add us back.
                        // Without this, a node that joined everyone stays
                        // invisible to the whole fleet until a manual
                        // re-add (the "X250-EXP missing everywhere" case).
                        let they_list_us = peer_state
                            .federated_peers
                            .iter()
                            .any(|h| h.did == local_did);
                        // An empty onion means we have no address to
                        // advertise yet, so skip the re-assert.
                        if !they_list_us && !local_onion.is_empty() {
                            match crate::federation::notify_join(
                                &node.onion,
                                node.fips_npub.as_deref(),
                                &local_did,
                                &local_onion,
                                &local_pubkey,
                                local_fips_npub.as_deref(),
                                local_name.as_deref(),
                                |b| node_identity.sign(b),
                            )
                            .await
                            {
                                // Count only re-assertions that were
                                // actually accepted by `notify_join`.
                                Ok(_) => healed += 1,
                                Err(e) => debug!(
                                    peer = %node.did,
                                    error = %e,
                                    "federation auto-sync re-assert failed (non-fatal)"
                                ),
                            }
                        }
                    }
                    Err(e) => {
                        debug!(peer = %node.did, error = %e, "federation auto-sync (non-fatal)")
                    }
                }
            }
            debug!(
                synced = ok,
                reasserted = healed,
                total = nodes.len(),
                "federation auto-sync pass complete"
            );
        }
    });
}
// Initialize container scanner — discovers installed apps from Podman/Docker
{
let scanner = create_docker_scanner(&config).await?;