fix(federation): reliable symmetric auto-federation across LAN/Tor/FIPS
Federated nodes failed to converge to a full mesh across the LAN<->Tailscale boundary: nodes were invisible to peers, sync 'took ages' or timed out, and names only updated on a manual sync. Onions were healthy in both directions (~3-5s); the failures were at the application layer.
- B: federation dials fast-fail a dead FIPS path via .fips_timeout(6s) in sync_with_peer + notify_join, so the Tor fallback isn't stuck behind the full 30s FIPS budget when LAN and remote peers share no FIPS path.
- A: notify_join (peer-joined) now spawns a background task with retries + backoff instead of a single awaited best-effort POST, so the join RPC returns instantly (no 'Request timeout') and the inviter reliably learns about the joiner (federation was previously asymmetric).
- C: new 90s periodic federation auto-sync (none existed before) so renamed nodes and roster changes propagate without a manual Sync click.
- self-heal: each auto-sync re-asserts membership to any peer that doesn't list us back, converging the fleet to a full mesh and healing pre-existing asymmetry with no manual re-joins.
Validated live across 7 nodes: a previously fleet-invisible node became fully meshed automatically (logs: 'auto-sync ... reasserted=1', 'peer-joined ... delivered').
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1bce694ebb
commit
837cc02812
@ -254,11 +254,57 @@ pub(crate) async fn notify_join(
|
||||
"params": params,
|
||||
});
|
||||
|
||||
let _ = crate::fips::dial::PeerRequest::new(remote_fips_npub, remote_onion, "/rpc/v1")
|
||||
.service(crate::settings::transport::PeerService::Federation)
|
||||
.timeout(std::time::Duration::from_secs(30))
|
||||
.send_json(&body)
|
||||
.await;
|
||||
// Deliver the notification in the BACKGROUND with retries, and return
|
||||
// immediately. Two reasons:
|
||||
// 1. The join RPC must not block on this. Awaiting a cold FIPS overlay
|
||||
// (no shared FIPS path between LAN and remote/Tailscale peers) stalled
|
||||
// the whole join until FIPS timed out, surfacing as "Request timeout"
|
||||
// in the UI even though the local membership was already saved.
|
||||
// 2. If this single best-effort POST failed, the inviter never learned
|
||||
// about us → asymmetric federation (they couldn't see us). Retrying in
|
||||
// the background until it lands makes federation converge to symmetric.
|
||||
// `fips_timeout` fast-fails a dead FIPS path so the Tor fallback (which
|
||||
// answers an onion in ~3-5s) is reached quickly on each attempt.
|
||||
let remote_onion = remote_onion.to_string();
|
||||
let remote_fips_npub = remote_fips_npub.map(|s| s.to_string());
|
||||
tokio::spawn(async move {
|
||||
// Up to 5 attempts with linear backoff (waiting 0s, 10s, 20s, 30s, 40s before each) — covers a
|
||||
// peer that is briefly unreachable (restarting, publishing its onion)
|
||||
// without hammering it.
|
||||
for attempt in 1..=5u32 {
|
||||
let res = crate::fips::dial::PeerRequest::new(
|
||||
remote_fips_npub.as_deref(),
|
||||
&remote_onion,
|
||||
"/rpc/v1",
|
||||
)
|
||||
.service(crate::settings::transport::PeerService::Federation)
|
||||
.timeout(std::time::Duration::from_secs(30))
|
||||
.fips_timeout(std::time::Duration::from_secs(6))
|
||||
.send_json(&body)
|
||||
.await;
|
||||
match res {
|
||||
Ok((resp, transport)) if resp.status().is_success() => {
|
||||
tracing::info!(
|
||||
attempt,
|
||||
transport = %transport,
|
||||
"peer-joined notification delivered to inviter"
|
||||
);
|
||||
return;
|
||||
}
|
||||
Ok((resp, _)) => tracing::warn!(
|
||||
attempt,
|
||||
status = %resp.status(),
|
||||
"peer-joined notification rejected; will retry"
|
||||
),
|
||||
Err(e) => tracing::warn!(attempt, error = %e, "peer-joined notification failed; will retry"),
|
||||
}
|
||||
tokio::time::sleep(std::time::Duration::from_secs(10 * attempt as u64)).await;
|
||||
}
|
||||
tracing::warn!(
|
||||
onion = %remote_onion,
|
||||
"peer-joined notification gave up after retries — peer may not see us until next sync"
|
||||
);
|
||||
});
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
@ -12,6 +12,9 @@ mod types;
|
||||
|
||||
// Re-export all public items so `crate::federation::*` continues to work.
|
||||
pub use invites::{accept_invite, create_invite};
|
||||
// Crate-internal: used by the periodic federation auto-sync to re-assert
|
||||
// membership to peers that don't list us back (asymmetry self-heal).
|
||||
pub(crate) use invites::notify_join;
|
||||
#[allow(unused_imports)]
|
||||
pub use storage::{
|
||||
add_node, fips_npub_for_onion, load_nodes, load_removed_dids, record_peer_transport,
|
||||
|
||||
@ -33,6 +33,12 @@ pub async fn sync_with_peer(
|
||||
.header("X-Federation-Sig", signature)
|
||||
.header("X-Federation-Timestamp", timestamp)
|
||||
.timeout(std::time::Duration::from_secs(30))
|
||||
// Fast-fail a cold/unreachable FIPS overlay (common between LAN and
|
||||
// remote/Tailscale peers that share no FIPS path) so the Tor fallback —
|
||||
// which answers an onion in ~3-5s — isn't stuck behind the full 30s FIPS
|
||||
// budget. Without this, a state sync to a FIPS-unreachable peer "took
|
||||
// ages" and join/sync appeared to time out even though Tor was healthy.
|
||||
.fips_timeout(std::time::Duration::from_secs(6))
|
||||
.send_json(&body)
|
||||
.await
|
||||
.context("Failed to reach federated peer")?;
|
||||
|
||||
@ -428,6 +428,99 @@ impl Server {
|
||||
});
|
||||
}
|
||||
|
||||
// Periodic federation auto-sync. Pulls every federated peer's state on a
|
||||
// timer so renamed nodes and roster changes propagate WITHOUT a manual
|
||||
// "Sync" click. Each sync now fast-fails a dead FIPS path and falls back
|
||||
// to Tor (~3-5s), so a full pass over a handful of peers is quick.
|
||||
{
|
||||
let data_dir = config.data_dir.clone();
|
||||
let state = state_manager.clone();
|
||||
tokio::spawn(async move {
|
||||
// Delay the first pass so Tor/onion publishing settles after boot.
|
||||
tokio::time::sleep(Duration::from_secs(20)).await;
|
||||
let mut interval = tokio::time::interval(Duration::from_secs(90));
|
||||
loop {
|
||||
interval.tick().await;
|
||||
let nodes = match crate::federation::load_nodes(&data_dir).await {
|
||||
Ok(n) if !n.is_empty() => n,
|
||||
_ => continue,
|
||||
};
|
||||
let (snap, _) = state.get_snapshot().await;
|
||||
let local_did =
|
||||
match crate::identity::did_key_from_pubkey_hex(&snap.server_info.pubkey) {
|
||||
Ok(d) => d,
|
||||
Err(_) => continue,
|
||||
};
|
||||
let identity_dir = data_dir.join("identity");
|
||||
let node_identity =
|
||||
match crate::identity::NodeIdentity::load_or_create(&identity_dir).await {
|
||||
Ok(id) => id,
|
||||
Err(_) => continue,
|
||||
};
|
||||
// Our own identity, for re-asserting membership to any peer
|
||||
// that doesn't list us back (asymmetry self-heal, below).
|
||||
let local_onion = snap.server_info.tor_address.clone().unwrap_or_default();
|
||||
let local_pubkey = snap.server_info.pubkey.clone();
|
||||
let local_name = snap.server_info.name.clone();
|
||||
let local_fips_npub =
|
||||
crate::identity::fips_npub(&identity_dir).await.unwrap_or(None);
|
||||
let mut ok = 0usize;
|
||||
let mut healed = 0usize;
|
||||
for node in &nodes {
|
||||
if node.trust_level == crate::federation::TrustLevel::Untrusted {
|
||||
continue;
|
||||
}
|
||||
match crate::federation::sync_with_peer(&data_dir, node, &local_did, |b| {
|
||||
node_identity.sign(b)
|
||||
})
|
||||
.await
|
||||
{
|
||||
Ok(state) => {
|
||||
ok += 1;
|
||||
// Asymmetry self-heal: if this peer's exported
|
||||
// trusted list doesn't include us, our original
|
||||
// peer-joined never landed (e.g. it was sent
|
||||
// before the reliable-notify fix, or the peer was
|
||||
// down). Re-assert membership over the now
|
||||
// FIPS-fast-failing/Tor path so they add us back.
|
||||
// Without this, a node that joined everyone stays
|
||||
// invisible to the whole fleet until a manual
|
||||
// re-add (the "X250-EXP missing everywhere" case).
|
||||
let they_list_us = state
|
||||
.federated_peers
|
||||
.iter()
|
||||
.any(|h| h.did == local_did);
|
||||
if !they_list_us && !local_onion.is_empty() {
|
||||
crate::federation::notify_join(
|
||||
&node.onion,
|
||||
node.fips_npub.as_deref(),
|
||||
&local_did,
|
||||
&local_onion,
|
||||
&local_pubkey,
|
||||
local_fips_npub.as_deref(),
|
||||
local_name.as_deref(),
|
||||
|b| node_identity.sign(b),
|
||||
)
|
||||
.await
|
||||
.ok();
|
||||
healed += 1;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!(peer = %node.did, error = %e, "federation auto-sync (non-fatal)")
|
||||
}
|
||||
}
|
||||
}
|
||||
debug!(
|
||||
synced = ok,
|
||||
reasserted = healed,
|
||||
total = nodes.len(),
|
||||
"federation auto-sync pass complete"
|
||||
);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Initialize container scanner — discovers installed apps from Podman/Docker
|
||||
{
|
||||
let scanner = create_docker_scanner(&config).await?;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user