fix(federation): reliable symmetric auto-federation across LAN/Tor/FIPS
Federated nodes failed to converge to full-mesh across the LAN<->Tailscale boundary: nodes were invisible to peers, sync 'took ages'/timed out, and names only updated on a manual sync. Onions were healthy in both directions (~3-5s); the failures were app-layer.

- B: federation dials fast-fail a dead FIPS path via .fips_timeout(6s) in sync_with_peer + notify_join, so the Tor fallback isn't stuck behind the full 30s FIPS budget when LAN and remote peers share no FIPS path.
- A: notify_join (peer-joined) now spawns with retries+backoff instead of a single awaited best-effort POST, so the join RPC returns instantly (no 'Request timeout') and the inviter reliably learns the joiner (was asymmetric).
- C: new 90s periodic federation auto-sync (none existed) so renamed nodes and roster changes propagate without a manual Sync click.
- self-heal: each auto-sync re-asserts membership to any peer that doesn't list us back, converging the fleet to full-mesh and healing pre-existing asymmetry with no manual re-joins.

Validated live across 7 nodes: a previously fleet-invisible node became fully meshed automatically (logs: 'auto-sync ... reasserted=1', 'peer-joined ... delivered').

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1bce694ebb
commit
837cc02812
@ -254,11 +254,57 @@ pub(crate) async fn notify_join(
|
|||||||
"params": params,
|
"params": params,
|
||||||
});
|
});
|
||||||
|
|
||||||
let _ = crate::fips::dial::PeerRequest::new(remote_fips_npub, remote_onion, "/rpc/v1")
|
// Deliver the notification in the BACKGROUND with retries, and return
|
||||||
.service(crate::settings::transport::PeerService::Federation)
|
// immediately. Two reasons:
|
||||||
.timeout(std::time::Duration::from_secs(30))
|
// 1. The join RPC must not block on this. Awaiting a cold FIPS overlay
|
||||||
.send_json(&body)
|
// (no shared FIPS path between LAN and remote/Tailscale peers) stalled
|
||||||
.await;
|
// the whole join until FIPS timed out, surfacing as "Request timeout"
|
||||||
|
// in the UI even though the local membership was already saved.
|
||||||
|
// 2. If this single best-effort POST failed, the inviter never learned
|
||||||
|
// about us → asymmetric federation (they couldn't see us). Retrying in
|
||||||
|
// the background until it lands makes federation converge to symmetric.
|
||||||
|
// `fips_timeout` fast-fails a dead FIPS path so the Tor fallback (which
|
||||||
|
// answers an onion in ~3-5s) is reached quickly on each attempt.
|
||||||
|
let remote_onion = remote_onion.to_string();
|
||||||
|
let remote_fips_npub = remote_fips_npub.map(|s| s.to_string());
|
||||||
|
tokio::spawn(async move {
|
||||||
|
// Up to 5 attempts with linear backoff (waits of 10s, 20s, 30s, 40s between attempts) — covers a
|
||||||
|
// peer that is briefly unreachable (restarting, publishing its onion)
|
||||||
|
// without hammering it.
|
||||||
|
for attempt in 1..=5u32 {
|
||||||
|
let res = crate::fips::dial::PeerRequest::new(
|
||||||
|
remote_fips_npub.as_deref(),
|
||||||
|
&remote_onion,
|
||||||
|
"/rpc/v1",
|
||||||
|
)
|
||||||
|
.service(crate::settings::transport::PeerService::Federation)
|
||||||
|
.timeout(std::time::Duration::from_secs(30))
|
||||||
|
.fips_timeout(std::time::Duration::from_secs(6))
|
||||||
|
.send_json(&body)
|
||||||
|
.await;
|
||||||
|
match res {
|
||||||
|
Ok((resp, transport)) if resp.status().is_success() => {
|
||||||
|
tracing::info!(
|
||||||
|
attempt,
|
||||||
|
transport = %transport,
|
||||||
|
"peer-joined notification delivered to inviter"
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
Ok((resp, _)) => tracing::warn!(
|
||||||
|
attempt,
|
||||||
|
status = %resp.status(),
|
||||||
|
"peer-joined notification rejected; will retry"
|
||||||
|
),
|
||||||
|
Err(e) => tracing::warn!(attempt, error = %e, "peer-joined notification failed; will retry"),
|
||||||
|
}
|
||||||
|
tokio::time::sleep(std::time::Duration::from_secs(10 * attempt as u64)).await;
|
||||||
|
}
|
||||||
|
tracing::warn!(
|
||||||
|
onion = %remote_onion,
|
||||||
|
"peer-joined notification gave up after retries — peer may not see us until next sync"
|
||||||
|
);
|
||||||
|
});
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -12,6 +12,9 @@ mod types;
|
|||||||
|
|
||||||
// Re-export all public items so `crate::federation::*` continues to work.
|
// Re-export all public items so `crate::federation::*` continues to work.
|
||||||
pub use invites::{accept_invite, create_invite};
|
pub use invites::{accept_invite, create_invite};
|
||||||
|
// Crate-internal: used by the periodic federation auto-sync to re-assert
|
||||||
|
// membership to peers that don't list us back (asymmetry self-heal).
|
||||||
|
pub(crate) use invites::notify_join;
|
||||||
#[allow(unused_imports)]
|
#[allow(unused_imports)]
|
||||||
pub use storage::{
|
pub use storage::{
|
||||||
add_node, fips_npub_for_onion, load_nodes, load_removed_dids, record_peer_transport,
|
add_node, fips_npub_for_onion, load_nodes, load_removed_dids, record_peer_transport,
|
||||||
|
|||||||
@ -33,6 +33,12 @@ pub async fn sync_with_peer(
|
|||||||
.header("X-Federation-Sig", signature)
|
.header("X-Federation-Sig", signature)
|
||||||
.header("X-Federation-Timestamp", timestamp)
|
.header("X-Federation-Timestamp", timestamp)
|
||||||
.timeout(std::time::Duration::from_secs(30))
|
.timeout(std::time::Duration::from_secs(30))
|
||||||
|
// Fast-fail a cold/unreachable FIPS overlay (common between LAN and
|
||||||
|
// remote/Tailscale peers that share no FIPS path) so the Tor fallback —
|
||||||
|
// which answers an onion in ~3-5s — isn't stuck behind the full 30s FIPS
|
||||||
|
// budget. Without this, a state sync to a FIPS-unreachable peer "took
|
||||||
|
// ages" and join/sync appeared to time out even though Tor was healthy.
|
||||||
|
.fips_timeout(std::time::Duration::from_secs(6))
|
||||||
.send_json(&body)
|
.send_json(&body)
|
||||||
.await
|
.await
|
||||||
.context("Failed to reach federated peer")?;
|
.context("Failed to reach federated peer")?;
|
||||||
|
|||||||
@ -428,6 +428,99 @@ impl Server {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Periodic federation auto-sync. Pulls every federated peer's state on a
|
||||||
|
// timer so renamed nodes and roster changes propagate WITHOUT a manual
|
||||||
|
// "Sync" click. Each sync now fast-fails a dead FIPS path and falls back
|
||||||
|
// to Tor (~3-5s), so a full pass over a handful of peers is quick.
|
||||||
|
{
|
||||||
|
let data_dir = config.data_dir.clone();
|
||||||
|
let state = state_manager.clone();
|
||||||
|
tokio::spawn(async move {
|
||||||
|
// Delay the first pass so Tor/onion publishing settles after boot.
|
||||||
|
tokio::time::sleep(Duration::from_secs(20)).await;
|
||||||
|
let mut interval = tokio::time::interval(Duration::from_secs(90));
|
||||||
|
loop {
|
||||||
|
interval.tick().await;
|
||||||
|
let nodes = match crate::federation::load_nodes(&data_dir).await {
|
||||||
|
Ok(n) if !n.is_empty() => n,
|
||||||
|
_ => continue,
|
||||||
|
};
|
||||||
|
let (snap, _) = state.get_snapshot().await;
|
||||||
|
let local_did =
|
||||||
|
match crate::identity::did_key_from_pubkey_hex(&snap.server_info.pubkey) {
|
||||||
|
Ok(d) => d,
|
||||||
|
Err(_) => continue,
|
||||||
|
};
|
||||||
|
let identity_dir = data_dir.join("identity");
|
||||||
|
let node_identity =
|
||||||
|
match crate::identity::NodeIdentity::load_or_create(&identity_dir).await {
|
||||||
|
Ok(id) => id,
|
||||||
|
Err(_) => continue,
|
||||||
|
};
|
||||||
|
// Our own identity, for re-asserting membership to any peer
|
||||||
|
// that doesn't list us back (asymmetry self-heal, below).
|
||||||
|
let local_onion = snap.server_info.tor_address.clone().unwrap_or_default();
|
||||||
|
let local_pubkey = snap.server_info.pubkey.clone();
|
||||||
|
let local_name = snap.server_info.name.clone();
|
||||||
|
let local_fips_npub =
|
||||||
|
crate::identity::fips_npub(&identity_dir).await.unwrap_or(None);
|
||||||
|
let mut ok = 0usize;
|
||||||
|
let mut healed = 0usize;
|
||||||
|
for node in &nodes {
|
||||||
|
if node.trust_level == crate::federation::TrustLevel::Untrusted {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
match crate::federation::sync_with_peer(&data_dir, node, &local_did, |b| {
|
||||||
|
node_identity.sign(b)
|
||||||
|
})
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(state) => {
|
||||||
|
ok += 1;
|
||||||
|
// Asymmetry self-heal: if this peer's exported
|
||||||
|
// trusted list doesn't include us, our original
|
||||||
|
// peer-joined never landed (e.g. it was sent
|
||||||
|
// before the reliable-notify fix, or the peer was
|
||||||
|
// down). Re-assert membership over the now
|
||||||
|
// FIPS-fast-failing/Tor path so they add us back.
|
||||||
|
// Without this, a node that joined everyone stays
|
||||||
|
// invisible to the whole fleet until a manual
|
||||||
|
// re-add (the "X250-EXP missing everywhere" case).
|
||||||
|
let they_list_us = state
|
||||||
|
.federated_peers
|
||||||
|
.iter()
|
||||||
|
.any(|h| h.did == local_did);
|
||||||
|
if !they_list_us && !local_onion.is_empty() {
|
||||||
|
crate::federation::notify_join(
|
||||||
|
&node.onion,
|
||||||
|
node.fips_npub.as_deref(),
|
||||||
|
&local_did,
|
||||||
|
&local_onion,
|
||||||
|
&local_pubkey,
|
||||||
|
local_fips_npub.as_deref(),
|
||||||
|
local_name.as_deref(),
|
||||||
|
|b| node_identity.sign(b),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.ok();
|
||||||
|
healed += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
debug!(peer = %node.did, error = %e, "federation auto-sync (non-fatal)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
debug!(
|
||||||
|
synced = ok,
|
||||||
|
reasserted = healed,
|
||||||
|
total = nodes.len(),
|
||||||
|
"federation auto-sync pass complete"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
// Initialize container scanner — discovers installed apps from Podman/Docker
|
// Initialize container scanner — discovers installed apps from Podman/Docker
|
||||||
{
|
{
|
||||||
let scanner = create_docker_scanner(&config).await?;
|
let scanner = create_docker_scanner(&config).await?;
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user