From 837cc02812e6529b626c601b7c35aa7e3d9e607d Mon Sep 17 00:00:00 2001 From: archipelago Date: Fri, 19 Jun 2026 09:50:10 -0400 Subject: [PATCH] fix(federation): reliable symmetric auto-federation across LAN/Tor/FIPS Federated nodes failed to converge to a full mesh across the LAN<->Tailscale boundary: nodes were invisible to peers, syncs 'took ages' or timed out, and names were only updated by a manual sync. Onions were healthy in both directions (~3-5s); the failures were app-layer. - B: federation dials fast-fail a dead FIPS path via .fips_timeout(6s) in sync_with_peer + notify_join, so the Tor fallback isn't stuck behind the full 30s FIPS budget when LAN and remote peers share no FIPS path. - A: notify_join (peer-joined) is now delivered from a spawned background task with up to 5 attempts and linear backoff instead of a single awaited best-effort POST, so the join RPC returns instantly (no 'Request timeout') and the inviter reliably learns about the joiner (previously a single failed POST left the federation asymmetric). - C: new 90s periodic federation auto-sync (none existed before) so renamed nodes and roster changes propagate without a manual Sync click. - self-heal: after each successful sync, the auto-sync re-asserts membership to any peer whose returned peer list doesn't include us, converging the fleet to a full mesh and healing pre-existing asymmetry with no manual re-joins. Validated live across 7 nodes: a previously fleet-invisible node became fully meshed automatically (logs: 'auto-sync ... reasserted=1', 'peer-joined ... delivered'). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- core/archipelago/src/federation/invites.rs | 56 +++++++++++-- core/archipelago/src/federation/mod.rs | 3 + core/archipelago/src/federation/sync.rs | 6 ++ core/archipelago/src/server.rs | 93 ++++++++++++++++++++++ 4 files changed, 153 insertions(+), 5 deletions(-) diff --git a/core/archipelago/src/federation/invites.rs b/core/archipelago/src/federation/invites.rs index c88d4062..ecb8a63c 100644 --- a/core/archipelago/src/federation/invites.rs +++ b/core/archipelago/src/federation/invites.rs @@ -254,11 +254,57 @@ pub(crate) async fn notify_join( "params": params, }); - let _ = crate::fips::dial::PeerRequest::new(remote_fips_npub, remote_onion, "/rpc/v1") - .service(crate::settings::transport::PeerService::Federation) - .timeout(std::time::Duration::from_secs(30)) - .send_json(&body) - .await; + // Deliver the notification in the BACKGROUND with retries, and return + // immediately. Two reasons: + // 1. The join RPC must not block on this. Awaiting a cold FIPS overlay + // (no shared FIPS path between LAN and remote/Tailscale peers) stalled + // the whole join until FIPS timed out, surfacing as "Request timeout" + // in the UI even though the local membership was already saved. + // 2. If this single best-effort POST failed, the inviter never learned + // about us → asymmetric federation (they couldn't see us). Retrying in + // the background until it lands makes federation converge to symmetric. + // `fips_timeout` fast-fails a dead FIPS path so the Tor fallback (which + // answers an onion in ~3-5s) is reached quickly on each attempt. + let remote_onion = remote_onion.to_string(); + let remote_fips_npub = remote_fips_npub.map(|s| s.to_string()); + tokio::spawn(async move { + // ~5 attempts with linear backoff: 0s, 10s, 20s, 30s, 40s — covers a + // peer that is briefly unreachable (restarting, publishing its onion) + // without hammering it. 
+ for attempt in 1..=5u32 { + let res = crate::fips::dial::PeerRequest::new( + remote_fips_npub.as_deref(), + &remote_onion, + "/rpc/v1", + ) + .service(crate::settings::transport::PeerService::Federation) + .timeout(std::time::Duration::from_secs(30)) + .fips_timeout(std::time::Duration::from_secs(6)) + .send_json(&body) + .await; + match res { + Ok((resp, transport)) if resp.status().is_success() => { + tracing::info!( + attempt, + transport = %transport, + "peer-joined notification delivered to inviter" + ); + return; + } + Ok((resp, _)) => tracing::warn!( + attempt, + status = %resp.status(), + "peer-joined notification rejected; will retry" + ), + Err(e) => tracing::warn!(attempt, error = %e, "peer-joined notification failed; will retry"), + } + tokio::time::sleep(std::time::Duration::from_secs(10 * attempt as u64)).await; + } + tracing::warn!( + onion = %remote_onion, + "peer-joined notification gave up after retries — peer may not see us until next sync" + ); + }); Ok(()) } diff --git a/core/archipelago/src/federation/mod.rs b/core/archipelago/src/federation/mod.rs index 11da5736..3e181fb4 100644 --- a/core/archipelago/src/federation/mod.rs +++ b/core/archipelago/src/federation/mod.rs @@ -12,6 +12,9 @@ mod types; // Re-export all public items so `crate::federation::*` continues to work. pub use invites::{accept_invite, create_invite}; +// Crate-internal: used by the periodic federation auto-sync to re-assert +// membership to peers that don't list us back (asymmetry self-heal). 
+pub(crate) use invites::notify_join; #[allow(unused_imports)] pub use storage::{ add_node, fips_npub_for_onion, load_nodes, load_removed_dids, record_peer_transport, diff --git a/core/archipelago/src/federation/sync.rs b/core/archipelago/src/federation/sync.rs index d7b1762e..5c460ca4 100644 --- a/core/archipelago/src/federation/sync.rs +++ b/core/archipelago/src/federation/sync.rs @@ -33,6 +33,12 @@ pub async fn sync_with_peer( .header("X-Federation-Sig", signature) .header("X-Federation-Timestamp", timestamp) .timeout(std::time::Duration::from_secs(30)) + // Fast-fail a cold/unreachable FIPS overlay (common between LAN and + // remote/Tailscale peers that share no FIPS path) so the Tor fallback — + // which answers an onion in ~3-5s — isn't stuck behind the full 30s FIPS + // budget. Without this, a state sync to a FIPS-unreachable peer "took + // ages" and join/sync appeared to time out even though Tor was healthy. + .fips_timeout(std::time::Duration::from_secs(6)) .send_json(&body) .await .context("Failed to reach federated peer")?; diff --git a/core/archipelago/src/server.rs b/core/archipelago/src/server.rs index 38b31a85..ece04176 100644 --- a/core/archipelago/src/server.rs +++ b/core/archipelago/src/server.rs @@ -428,6 +428,99 @@ impl Server { }); } + // Periodic federation auto-sync. Pulls every federated peer's state on a + // timer so renamed nodes and roster changes propagate WITHOUT a manual + // "Sync" click. Each sync now fast-fails a dead FIPS path and falls back + // to Tor (~3-5s), so a full pass over a handful of peers is quick. + { + let data_dir = config.data_dir.clone(); + let state = state_manager.clone(); + tokio::spawn(async move { + // Delay the first pass so Tor/onion publishing settles after boot. 
+ tokio::time::sleep(Duration::from_secs(20)).await; + let mut interval = tokio::time::interval(Duration::from_secs(90)); + loop { + interval.tick().await; + let nodes = match crate::federation::load_nodes(&data_dir).await { + Ok(n) if !n.is_empty() => n, + _ => continue, + }; + let (snap, _) = state.get_snapshot().await; + let local_did = + match crate::identity::did_key_from_pubkey_hex(&snap.server_info.pubkey) { + Ok(d) => d, + Err(_) => continue, + }; + let identity_dir = data_dir.join("identity"); + let node_identity = + match crate::identity::NodeIdentity::load_or_create(&identity_dir).await { + Ok(id) => id, + Err(_) => continue, + }; + // Our own identity, for re-asserting membership to any peer + // that doesn't list us back (asymmetry self-heal, below). + let local_onion = snap.server_info.tor_address.clone().unwrap_or_default(); + let local_pubkey = snap.server_info.pubkey.clone(); + let local_name = snap.server_info.name.clone(); + let local_fips_npub = + crate::identity::fips_npub(&identity_dir).await.unwrap_or(None); + let mut ok = 0usize; + let mut healed = 0usize; + for node in &nodes { + if node.trust_level == crate::federation::TrustLevel::Untrusted { + continue; + } + match crate::federation::sync_with_peer(&data_dir, node, &local_did, |b| { + node_identity.sign(b) + }) + .await + { + Ok(state) => { + ok += 1; + // Asymmetry self-heal: if this peer's exported + // trusted list doesn't include us, our original + // peer-joined never landed (e.g. it was sent + // before the reliable-notify fix, or the peer was + // down). Re-assert membership over the now + // FIPS-fast-failing/Tor path so they add us back. + // Without this, a node that joined everyone stays + // invisible to the whole fleet until a manual + // re-add (the "X250-EXP missing everywhere" case). 
+ let they_list_us = state + .federated_peers + .iter() + .any(|h| h.did == local_did); + if !they_list_us && !local_onion.is_empty() { + crate::federation::notify_join( + &node.onion, + node.fips_npub.as_deref(), + &local_did, + &local_onion, + &local_pubkey, + local_fips_npub.as_deref(), + local_name.as_deref(), + |b| node_identity.sign(b), + ) + .await + .ok(); + healed += 1; + } + } + Err(e) => { + debug!(peer = %node.did, error = %e, "federation auto-sync (non-fatal)") + } + } + } + debug!( + synced = ok, + reasserted = healed, + total = nodes.len(), + "federation auto-sync pass complete" + ); + } + }); + } + // Initialize container scanner — discovers installed apps from Podman/Docker { let scanner = create_docker_scanner(&config).await?;